
Petitionscraper

For Petitions For Future, I wrote a scraper for the petition pages that makes it quicker to get at information such as the demand, the initiator, the signature goal, etc.

To that end, I wrote this Python script:

import re
import sys
import json
import requests
from bs4 import BeautifulSoup

parser = {}

def openpetition(soup):
    results = {}
    try:
        results["startdate"] = re.search(r"Gestartet (\d\d\.\d\d\.\d\d\d\d)", "".join(soup.stripped_strings))[1]
    except Exception:
        # fallback: read the start date from the status step list
        results["startdate"] = soup.find(class_="status-step-list").find("li").span.string.replace("Gestartet ", "")
    results["enddate"] = re.search(r"Sammlung bis (\d\d\.\d\d\.\d\d\d\d)", "".join([x["title"] for x in soup.find_all(title=True)]))[1]
    results["goal"] = soup.find(class_=re.compile("^goal theme-text-variant-5")).strong.string.strip()
    results["initiator"] = soup.find(class_="initiator-name").string.strip()
    results["addressee"] = soup.find(class_="addressee-name").string.strip()
    results["description"] = soup.head.find(name="meta", property="og:description")["content"].strip()
    results["signer"] = soup.find(class_="signer-information-box").span.strong.string
    return results
parser["openpetition.de"] = openpetition

def bundestag(soup):
    results = {}
    results["startdate"] = re.search("\d\d\.\d\d\.\d\d\d\d", soup.find(text="Erstellungsdatum").parent.parent.get_text())[0]
    results["enddate"] = re.search("\d\d\.\d\d\.\d\d\d\d", soup.find(text="Mitzeichnungsfrist").parent.parent.get_text())[0]
    results["goal"] = "50.000"
    results["addressee"] = "Deutscher Bundestag"
    results["signer"] = soup.find(class_="mzanzahl").string.strip()
    results["description"] = soup.find(class_="titel-begruedung-container").find_all("p")[1].get_text()
    return results
parser["epetitionen.bundestag.de"] = bundestag

def campact(soup):
    results = {}
    results["startdate"] = soup.find_all("abbr","timeago")[-1]["datetime"]
    results["goal"] = int(re.search("\d+",soup.find("div", "stat-label").string.strip())[0])
    results["initiator"] = soup.find("div", "name").string
    results["addressee"] = soup.find("h2", "who").get_text().strip().replace("An:\n\n", "")
    results["signer"] = int(soup.find("div", "stat-number").span.string)
    results["description"] = soup.find("div", "what").get_text()
    return results
parser["weact.campact.de"] = campact

def actionnetwork(soup):
    results = {}
    results["goal"] = int(re.search("\d+,\d+$", soup.find("div", "action_status_goal").string.strip())[0].replace(",", ""))
    results["initiator"] = soup.find("div", "action_owner").get_text().strip()
    results["addressee"] = soup.find("h4", "action_target").get_text().strip()
    results["signer"] = int(re.search("\d+,\d+", soup.find("div", "action_status_running_total").string)[0].replace(",", ""))
    results["description"] = soup.find_all("meta", property="og:description")[0]["content"].strip()
    return results    
parser["actionnetwork.org"] = actionnetwork

def regenwald(soup):
    results = {}
    results["title"] = soup.find("h1").string
    results["goal"] = re.search("(\d+\.?)+", soup.find("p", "goal").string)[0]
    results["initiator"] = "Rettet den Regenwald e.V."
    results["addressee"] = soup.find("p", "recipient").string.replace("An: ", "")
    results["signer"] = soup.find("div", id="petition-form").strong.get_text().strip().replace(" Teilnehmer", "")
    results["description"] = soup.find("p", "intro").string
    return results
parser["regenwald.org"] = regenwald

def act350org(soup):
    results = {}
    results["title"] = soup.find("h2", "title3").span.string
    results["description"] = soup.find("div", id="action-description").get_text().strip()
    results["initiator"] = "350.org"
    return results
parser["act.350.org"] = act350org

def greenpeace(soup):
    results = {}
    results["title"] = soup.find("title").string
    results["description"] = soup.find("meta", {"name":"description"})["content"]
    results["initiator"] = "Greenpeace e. V."
    results["url"] = soup.find("link", rel="canonical")["href"]
    return results
parser["act.greenpeace.de"] = greenpeace

url = sys.argv[1]

# "about:parser" is a special value that lists the supported domains
if url == "about:parser":
    print(json.dumps(list(parser.keys())))
    exit()

# extract the domain from the URL to select the matching parser
domain = re.match(r"https?://(?:www\.)?(\w+\.?\w+\.(?:de|eu|org))", url)[1]
if domain in parser:
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    print(json.dumps(parser[domain](soup), indent=4, ensure_ascii=False, sort_keys=True))
elif domain == "change.org":
    print(json.dumps("change.org is not supported: https://bigbrotherawards.de/2016/wirtschaft-changeorg"))
else:
    print(json.dumps(f"{domain} is not supported yet, please send me an email to petitionscraper@u1l.de"))

Then, to make it more accessible and to let me add more sites over time, I wrote another small Go program that turns the Python script into an API:

package main

import (
	"bytes"
	"fmt"
	"log"
	"net/http"
	"os/exec"
)

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		url := r.FormValue("url")
		if url != "" {
			// hand the URL to the Python script and relay its JSON output
			cmd := exec.Command("python3.9", "petitionscraper.py", url)
			var out bytes.Buffer
			cmd.Stdout = &out
			if err := cmd.Run(); err != nil {
				log.Println(err)
			}
			w.Header().Add("Content-Type", "application/json")
			fmt.Fprint(w, out.String())
		} else {
			fmt.Fprint(w, "Send a GET request with url as a parameter set to the url you want to get information from")
		}
	})
	log.Fatal(http.ListenAndServe(":8084", nil))
}
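
Once the server is running, any HTTP client can query it; for example (assuming it runs locally on the port 8084 configured above, with a placeholder petition URL):

curl "http://localhost:8084/?url=https://www.openpetition.de/petition/online/example"

The Go program just shells out to the Python script and relays its stdout, so the response is the same JSON the script would print on the command line.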

The scraper can now be used at petitonscraper.u1l.de. Ideally, all petition sites would support Microformats, so that no new function would be required for each site.
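
As an illustration, here is a minimal sketch of such a generic parser, assuming a page marked up its data as an h-entry and using the mf2py library; note that none of the sites above actually publish these microformats today:

import mf2py  # microformats2 parser, pip install mf2py

def microformats(soup):
    # Hypothetical generic fallback: one parser for every site that
    # publishes its data using the standard h-entry vocabulary.
    parsed = mf2py.parse(doc=str(soup))
    for item in parsed["items"]:
        if "h-entry" in item["type"]:
            props = item["properties"]
            results = {}
            if "name" in props:
                results["title"] = props["name"][0]
            if "summary" in props:
                results["description"] = props["summary"][0]
            if "published" in props:
                results["startdate"] = props["published"][0]
            if "author" in props:
                # authors are usually embedded h-cards; use the display name
                author = props["author"][0]
                if isinstance(author, dict):
                    author = author["properties"]["name"][0]
                results["initiator"] = author
            return results
    return {}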

