From fed4b15d0815ba89250095fc0b600f0564fc8d32 Mon Sep 17 00:00:00 2001
From: lub <git@lubiland.de>
Date: Sun, 12 Jul 2020 12:30:03 +0200
Subject: [PATCH] fix scraping: select articles by section id

---
 scrape.py | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/scrape.py b/scrape.py
index 972438e..a30d7f5 100644
--- a/scrape.py
+++ b/scrape.py
@@ -25,36 +25,42 @@ def get_blog():
 
     blog = []
 
-    feature_list_html = soup.find_all(class_='FeaturedArticle-link')
-    for feature_html in feature_list_html:
-        image_html = feature_html.find(class_='Card-image')
-        image_url_fragment = re.findall('url\("(.*?)"\)', image_html.attrs['style'])[0]
+    for featured_article in soup.select('#featured-articles article'):
+        image_html = featured_article.find(class_='Card-image')
+        image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0]
         image_url = 'https:'+image_url_fragment
 
-        text_list = feature_html.find_all(class_='text-truncate-ellipsis')
+        text_list = featured_article.select('.text-truncate-ellipsis')
+        category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower()
+        title = text_list[1].contents[0]
+
+        url = base_url+featured_article.find('a').attrs['href']
 
         blog.append({
             'image': image_url,
-            'category': text_list[0].contents[0].replace(' ', '').replace(':', '').lower(),
-            'title': text_list[1].contents[0],
-            'description': '',
-            'url': base_url+feature_html.attrs['href'],
+            'category': category,
+            'title': title,
+            'description': '', # featured articles don't have a description
+            'url': url,
         })
 
-    article_list_html = soup.find_all(class_='ArticleListItem')
-    for article_html in article_list_html:
-        image_html = article_html.find(class_='ArticleListItem-image')
-        image_url_fragment = re.findall('url\((.*?)\)', image_html.attrs['style'])[0]
+    for recent_article in soup.select('#recent-articles article'):
+        image_html = recent_article.find(class_='ArticleListItem-image')
+        image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
         image_url = 'https:'+image_url_fragment
 
-        content_html = article_html.find(class_='ArticleListItem-contentGrid')
+        category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower()
+        title = recent_article.find(class_='ArticleListItem-title').text
+        description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text
+
+        url = base_url+recent_article.find('a').attrs['href']
 
         blog.append({
             'image': image_url,
-            'category': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0].replace(' ', '').replace(':', '').lower(),
-            'title': content_html.find(class_='ArticleListItem-title').contents[0],
-            'description': content_html.find(class_='ArticleListItem-description').find(class_='h6').contents[0],
-            'url': base_url+article_html.find(class_='ArticleLink').attrs['href'],
+            'category': category,
+            'title': title,
+            'description': description,
+            'url': url
         })
 
     # reverse order so the oldest article is at [0]