"""Serve the calendar events of gruene-hohenlohe.de as plain text over HTTP.

Reconstructed from the post-image of the patch "add some rudimentary
webserver" (commit 0aa90f8) against scrape.py.

Every GET request re-scrapes the calendar overview page and each linked
event page, and answers with one text line per event.
"""
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

site_url = 'https://gruene-hohenlohe.de/'
overview_url = 'https://gruene-hohenlohe.de/kalender'
request_timeout = 60  # seconds, applied to every upstream request


def _fetch_soup(url):
    """Download *url* and return it parsed with the 'html.parser' backend.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (so error pages are never parsed as events).
    """
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _format_event(event_soup):
    """Return the one-line summary of an event page.

    The line has the form ``<date> <time> Uhr, , <title>[ <description>]``.
    Returns None when the page lacks the date/time ``<dd>`` entries, so a
    single malformed event does not abort the whole listing.
    """
    data = event_soup.select('.calendarize dl dd')
    if len(data) < 2:
        return None

    date = data[0].text.strip()
    # Only the first whitespace-separated token of the time span is shown,
    # i.e. whatever precedes the first space in the second <dd>.
    time = data[1].text.strip().split(' ')[0]

    # NOTE(review): the original disabled the "place" field (third <dd>)
    # but still emitted its trailing ', ' separator, which yields ', , '.
    # The format is kept byte-for-byte here; confirm whether the place
    # should be restored or the extra separator dropped.
    line = date + ' ' + time + ' Uhr' + ', ' + ', '

    titles = event_soup.select('.calendarize h1')
    if titles:
        line += titles[0].text.strip()

    # The description paragraph is optional on event pages.
    paragraphs = event_soup.select('.calendarize .text p')
    if paragraphs:
        line += ' ' + paragraphs[0].text.strip()

    return line


def scrape():
    """Scrape all events linked from the overview page.

    The overview page is fetched on every call so a long-running server
    reflects calendar changes.

    Returns:
        str: one line per event, joined with newlines; an empty string when
        the overview lists no events.

    Raises:
        requests.RequestException: if the overview or an event page cannot
            be fetched.
    """
    overview_soup = _fetch_soup(overview_url)

    lines = []
    for link in overview_soup.select('.media-body h2 a'):
        href = link.attrs.get('href')
        if not href:
            continue
        # urljoin avoids the '//' that plain concatenation produces for
        # root-relative hrefs, and is identical for relative ones.
        event_soup = _fetch_soup(urljoin(site_url, href))
        line = _format_event(event_soup)
        if line is not None:
            lines.append(line)

    return '\n'.join(lines)


class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
    """Answer every GET request with the freshly scraped event list."""

    def do_GET(self):
        """Send the scraped events as UTF-8 text, or 502 if scraping fails."""
        # Scrape before committing to a status line, so an upstream failure
        # is reported as an error instead of an empty 200 response.
        try:
            body = scrape().encode('utf-8')
        except requests.RequestException as err:
            self.log_error('scraping failed: %s', err)
            self.send_error(502, 'Upstream calendar unavailable')
            return

        self.send_response(200)
        # Declare the charset explicitly; the text contains non-ASCII
        # characters that clients would otherwise have to guess at.
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)


def main():
    """Run the HTTP server on all interfaces, port 8000, until interrupted."""
    httpd = HTTPServer(('', 8000), SimpleHTTPRequestHandler)
    httpd.serve_forever()


if __name__ == '__main__':
    main()