add different output formats

main
lub 3 weeks ago
parent e40c1cbd3f
commit b3d8044157

@ -1,6 +1,6 @@
FROM python:3-slim
WORKDIR /data
WORKDIR /src
COPY requirements.txt ./

@ -1,15 +1,20 @@
import requests
from bs4 import BeautifulSoup
from time import time
from datetime import datetime
from http.server import BaseHTTPRequestHandler, HTTPServer
def scrape():
output = ''
output = {
'print': '',
'digital': ''
}
overview_url = 'https://gruene-hohenlohe.de/kalender'
overview_html = requests.get(overview_url, timeout=60).text
overview_soup = BeautifulSoup(overview_html, 'html.parser')
month = None
for href in overview_soup.select('.media-body h2 a'):
event_url = 'https://gruene-hohenlohe.de/' + href.attrs['href']
event_html = requests.get(event_url, timeout=60).text
@ -18,38 +23,95 @@ def scrape():
data = event_soup.select('.calendarize dl dd')
# date
output += data[0].text.strip()
output += ' '
date = data[0].text.strip()
output['print'] += date
output['print'] += ' '
# day of week
date_splitted = date.split('.')
year = int(date_splitted[2])
previous_month = month
if (month := int(date_splitted[1].lstrip('0'))) != previous_month:
output['digital'] += '<br>'
match month:
case 2:
output['digital'] += 'Februar'
case 12:
output['digital'] += 'Dezember'
output['digital'] += '<br>'
day = int(date_splitted[0].lstrip('0'))
match datetime(year, month, day).weekday():
case 0:
output['digital'] += 'Montag'
case 1:
output['digital'] += 'Dienstag'
case 2:
output['digital'] += 'Mittwoch'
case 3:
output['digital'] += 'Donnerstag'
case 4:
output['digital'] += 'Freitag'
case 5:
output['digital'] += 'Samstag'
case 6:
output['digital'] += 'Sonntag'
output['digital'] += ', '
output['digital'] += date
output['digital'] += ' '
# time
timespan = data[1].text.strip()
time = timespan.split(' ')[0]
output += time
time_formatted = time
if time != 'Ganztags':
output += ' Uhr'
output += ', '
time_formatted += ' Uhr'
output['print'] += time_formatted
output['print'] += ', '
output['digital'] += time_formatted
output['digital'] += ', '
# place
if len(data) > 2:
output += data[2].text.strip()
output += ', '
place = data[2].text.strip()
output['print'] += place
output['print'] += ', '
output['digital'] += place
output['digital'] += ', '
# title
output += '<b>'
output += event_soup.select('.calendarize h1')[0].text.strip()
output += '</b>'
title = event_soup.select('.calendarize h1')[0].text.strip()
output['print'] += '<b>'
output['print'] += title
output['print'] += '</b>'
output['digital'] += '<b>'
output['digital'] += title
output['digital'] += '</b>'
# description
try:
description = event_soup.select('.calendarize .text p')[0].text.strip()
output += ' '
output += description
output['print'] += ' '
output['print'] += description
output['digital'] += ' '
output['digital'] += description
except IndexError:
pass
output += '<br>'
output['print'] += '<br>'
return output
output['digital'] += '<br>'
return output['print'] + "<hr>" + output['digital']
class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
def do_GET(self):
@ -69,8 +131,8 @@ with open('template.html', 'r') as templateFile:
templateHtml = templateFile.read()
cache = {
"time": 0,
"output": None
'time': 0,
'output': None
}
httpd = HTTPServer(('', 8000), SimpleHTTPRequestHandler)

Loading…
Cancel
Save