Petitionscraper
For Petitions For Future I wrote a scraper for the petition pages, so that information such as the demand, the initiator, the signature goal, etc. can be gathered more quickly.
For this I wrote the following Python script:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup

# Maps a domain to the function that extracts petition data from its pages.
parser = {}

def openpetition(soup):
    results = {}
    try:
        results["startdate"] = re.search(r"Gestartet (\d\d\.\d\d\.\d\d\d\d)", "".join(soup.stripped_strings))[1]
    except Exception:
        # Fallback: read the start date from the status step list instead.
        results["startdate"] = soup.find(class_="status-step-list").find("li").span.string.replace("Gestartet ", "")
    results["enddate"] = re.search(r"Sammlung bis (\d\d\.\d\d\.\d\d\d\d)", "".join([x["title"] for x in soup.find_all(title=True)]))[1]
    results["goal"] = soup.find(class_=re.compile("^goal theme-text-variant-5")).strong.string.strip()
    results["initiator"] = soup.find(class_="initiator-name").string.strip()
    results["addressee"] = soup.find(class_="addressee-name").string.strip()
    results["description"] = soup.head.find(name="meta", property="og:description")["content"].strip()
    results["signer"] = soup.find(class_="signer-information-box").span.strong.string
    return results
parser["openpetition.de"] = openpetition

def bundestag(soup):
    results = {}
    results["startdate"] = re.search(r"\d\d\.\d\d\.\d\d\d\d", soup.find(text="Erstellungsdatum").parent.parent.get_text())[0]
    results["enddate"] = re.search(r"\d\d\.\d\d\.\d\d\d\d", soup.find(text="Mitzeichnungsfrist").parent.parent.get_text())[0]
    # The quorum for Bundestag e-petitions is fixed at 50.000 signatures.
    results["goal"] = "50.000"
    results["addressee"] = "Deutscher Bundestag"
    results["signer"] = soup.find(class_="mzanzahl").string.strip()
    results["description"] = soup.find(class_="titel-begruedung-container").find_all("p")[1].get_text()
    return results
parser["epetitionen.bundestag.de"] = bundestag

def campact(soup):
    results = {}
    results["startdate"] = soup.find_all("abbr", "timeago")[-1]["datetime"]
    results["goal"] = int(re.search(r"\d+", soup.find("div", "stat-label").string.strip())[0])
    results["initiator"] = soup.find("div", "name").string
    results["addressee"] = soup.find("h2", "who").get_text().strip().replace("An:\n\n", "")
    results["signer"] = int(soup.find("div", "stat-number").span.string)
    results["description"] = soup.find("div", "what").get_text()
    return results
parser["weact.campact.de"] = campact

def actionnetwork(soup):
    results = {}
    results["goal"] = int(re.search(r"\d+,\d+$", soup.find("div", "action_status_goal").string.strip())[0].replace(",", ""))
    results["initiator"] = soup.find("div", "action_owner").get_text().strip()
    results["addressee"] = soup.find("h4", "action_target").get_text().strip()
    results["signer"] = int(re.search(r"\d+,\d+", soup.find("div", "action_status_running_total").string)[0].replace(",", ""))
    results["description"] = soup.find_all("meta", property="og:description")[0]["content"].strip()
    return results
parser["actionnetwork.org"] = actionnetwork

def regenwald(soup):
    results = {}
    results["title"] = soup.find("h1").string
    results["goal"] = re.search(r"(\d+\.?)+", soup.find("p", "goal").string)[0]
    results["initiator"] = "Rettet den Regenwald e.V."
    results["addressee"] = soup.find("p", "recipient").string.replace("An: ", "")
    results["signer"] = soup.find("div", id="petition-form").strong.get_text().strip().replace(" Teilnehmer", "")
    results["description"] = soup.find("p", "intro").string
    return results
parser["regenwald.org"] = regenwald

def act350org(soup):
    results = {}
    results["title"] = soup.find("h2", "title3").span.string
    results["description"] = soup.find("div", id="action-description").get_text().strip()
    results["initiator"] = "350.org"
    return results
parser["act.350.org"] = act350org

def greenpeace(soup):
    results = {}
    results["title"] = soup.find("title").string
    results["description"] = soup.find("meta", {"name": "description"})["content"]
    results["initiator"] = "Greenpeace e. V."
    results["url"] = soup.find("link", rel="canonical")["href"]
    return results
parser["act.greenpeace.de"] = greenpeace

url = sys.argv[1]
# Meta-URL: list the supported domains instead of scraping a page.
if url == "about:parser":
    print(json.dumps(list(parser.keys())))
    sys.exit()
domain = re.match(r"https?://(?:www\.)?(\w+\.?\w+\.(?:de|eu|org))", url)[1]
if domain in parser:
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    print(json.dumps(parser[domain](soup), indent=4, ensure_ascii=False, sort_keys=True))
elif domain == "change.org":
    print(json.dumps("change.org is not supported: https://bigbrotherawards.de/2016/wirtschaft-changeorg"))
else:
    print(json.dumps(f"{domain} is not supported yet, please send me an email to petitionscraper@u1l.de"))
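Each supported site gets its own small function that pulls the fields out of the page's BeautifulSoup tree, and support for another site just means one more entry in parser. A sketch of what that looks like for a made-up site (the domain and the selectors are hypothetical):

def examplesite(soup):
    # Hypothetical parser: the domain and CSS classes below are invented
    # for illustration and do not belong to a real petition site.
    results = {}
    results["title"] = soup.find("h1").get_text().strip()
    results["description"] = soup.find("meta", property="og:description")["content"].strip()
    results["signer"] = soup.find(class_="signature-count").get_text().strip()
    return results
parser["petitions.example.org"] = examplesite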
Then, to make it more accessible and so that I can gradually add more sites, I wrote another small Go program that turns the Python script into an API:
package main

import (
	"bytes"
	"fmt"
	"log"
	"net/http"
	"os/exec"
)

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		url := r.FormValue("url")
		if url != "" {
			// Hand the URL to the Python scraper and pass its JSON output through.
			cmd := exec.Command("python3.9", "petitionscraper.py", url)
			var out bytes.Buffer
			cmd.Stdout = &out
			if err := cmd.Run(); err != nil {
				log.Println(err)
			}
			w.Header().Add("Content-Type", "application/json")
			fmt.Fprint(w, out.String())
		} else {
			fmt.Fprint(w, "Send a GET request with url as a parameter set to the url you want to get information from")
		}
	})
	log.Fatal(http.ListenAndServe(":8084", nil))
}
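A request against the API then looks like this, here with Python's requests; a minimal sketch assuming the server runs locally on port 8084 (the petition URL is a placeholder):

import requests

# Ask the local API to scrape a petition page; the URL is made up.
r = requests.get("http://localhost:8084/",
                 params={"url": "https://www.openpetition.de/petition/online/example"})
print(r.json())

The special value about:parser returns the list of supported domains instead of scraping a page.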
The scraper can now be used at petitionscraper.u1l.de. Ideally, all petition sites would support Microformats, so that no new function would be needed for every single site.
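A minimal sketch of such a generic parser, assuming pages marked their fields with microformats2-style class names (p-name and p-summary exist in microformats2; petition-specific properties like p-goal are pure assumption, since no such vocabulary exists yet):

def microformats(soup):
    # One parser for every site: read the fields from microformats-style
    # class names instead of site-specific markup. The petition-specific
    # property names are hypothetical.
    results = {}
    for prop in ["name", "summary", "goal", "initiator", "addressee"]:
        element = soup.find(class_="p-" + prop)
        if element:
            results[prop] = element.get_text().strip()
    return results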