From b3d80441576eb95386905058eb5e30c33dcc92e2 Mon Sep 17 00:00:00 2001 From: lub Date: Mon, 2 Dec 2024 17:36:42 +0100 Subject: [PATCH] add different output formats --- Dockerfile | 2 +- scrape.py | 98 ++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2fedd5d..a5568cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3-slim -WORKDIR /data +WORKDIR /src COPY requirements.txt ./ diff --git a/scrape.py b/scrape.py index 7f35deb..fdcaf34 100644 --- a/scrape.py +++ b/scrape.py @@ -1,15 +1,20 @@ import requests from bs4 import BeautifulSoup from time import time +from datetime import datetime from http.server import BaseHTTPRequestHandler, HTTPServer def scrape(): - output = '' + output = { + 'print': '', + 'digital': '' + } overview_url = 'https://gruene-hohenlohe.de/kalender' overview_html = requests.get(overview_url, timeout=60).text overview_soup = BeautifulSoup(overview_html, 'html.parser') + month = None for href in overview_soup.select('.media-body h2 a'): event_url = 'https://gruene-hohenlohe.de/' + href.attrs['href'] event_html = requests.get(event_url, timeout=60).text @@ -18,38 +23,95 @@ def scrape(): data = event_soup.select('.calendarize dl dd') # date - output += data[0].text.strip() - output += ' ' + date = data[0].text.strip() + + output['print'] += date + output['print'] += ' ' + + # day of week + date_splitted = date.split('.') + year = int(date_splitted[2]) + previous_month = month + if (month := int(date_splitted[1].lstrip('0'))) != previous_month: + output['digital'] += '
' + match month: + case 2: + output['digital'] += 'Februar' + case 12: + output['digital'] += 'Dezember' + output['digital'] += '
' + day = int(date_splitted[0].lstrip('0')) + + match datetime(year, month, day).weekday(): + case 0: + output['digital'] += 'Montag' + case 1: + output['digital'] += 'Dienstag' + case 2: + output['digital'] += 'Mittwoch' + case 3: + output['digital'] += 'Donnerstag' + case 4: + output['digital'] += 'Freitag' + case 5: + output['digital'] += 'Samstag' + case 6: + output['digital'] += 'Sonntag' + output['digital'] += ', ' + output['digital'] += date + output['digital'] += ' ' # time timespan = data[1].text.strip() time = timespan.split(' ')[0] - - output += time + time_formatted = time if time != 'Ganztags': - output += ' Uhr' - output += ', ' + time_formatted += ' Uhr' + + output['print'] += time_formatted + output['print'] += ', ' + + output['digital'] += time_formatted + output['digital'] += ', ' # place if len(data) > 2: - output += data[2].text.strip() - output += ', ' + place = data[2].text.strip() + + output['print'] += place + output['print'] += ', ' + + output['digital'] += place + output['digital'] += ', ' # title - output += '' - output += event_soup.select('.calendarize h1')[0].text.strip() - output += '' + title = event_soup.select('.calendarize h1')[0].text.strip() + output['print'] += '' + output['print'] += title + output['print'] += '' + + output['digital'] += '' + output['digital'] += title + output['digital'] += '' + + # description try: description = event_soup.select('.calendarize .text p')[0].text.strip() - output += ' ' - output += description + + output['print'] += ' ' + output['print'] += description + + output['digital'] += ' ' + output['digital'] += description except IndexError: pass - output += '
' + output['print'] += '
' - return output + output['digital'] += '
' + + return output['print'] + "
" + output['digital'] class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): def do_GET(self): @@ -69,8 +131,8 @@ with open('template.html', 'r') as templateFile: templateHtml = templateFile.read() cache = { - "time": 0, - "output": None + 'time': 0, + 'output': None } httpd = HTTPServer(('', 8000), SimpleHTTPRequestHandler)