From 7990d787fc2612f83da108fbaa3ac764b757560e Mon Sep 17 00:00:00 2001
From: lub <git@lubiland.de>
Date: Sun, 12 Jul 2020 12:42:04 +0200
Subject: [PATCH] refactor category and image into own functions

---
 scrape.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/scrape.py b/scrape.py
index a30d7f5..3e0c0c7 100644
--- a/scrape.py
+++ b/scrape.py
@@ -16,6 +16,13 @@ def get_accesstoken_from_file(accesstoken_path):
     accesstoken_file.close()
 
     return single_accesstoken
+def extract_image_url(image_html):
+    # only recent articles wrap the url in double quotes, so the pattern
+    # has to match the url both with and without quotes
+    image_url_fragment = re.findall(r'url\("?(.*?)"?\)', image_html.attrs['style'])[0]
+    return 'https:'+image_url_fragment
+def sanitize_category(raw_category):
+    return raw_category.replace(' ', '').replace(':', '').lower()
 def get_blog():
     url = 'https://news.blizzard.com/en-us/'
     html = requests.get(url).text
@@ -26,13 +33,11 @@ def get_blog():
     blog = []
 
     for featured_article in soup.select('#featured-articles article'):
-        image_html = featured_article.find(class_='Card-image')
-        image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0]
-        image_url = 'https:'+image_url_fragment
+        image_url = extract_image_url(featured_article.find(class_='Card-image'))
 
         text_list = featured_article.select('.text-truncate-ellipsis')
-        category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower()
-        title = text_list[1].contents[0]
+        category = sanitize_category(text_list[0].text)
+        title = text_list[1].text
 
         url = base_url+featured_article.find('a').attrs['href']
 
@@ -45,11 +50,9 @@ def get_blog():
         })
 
     for recent_article in soup.select('#recent-articles article'):
-        image_html = recent_article.find(class_='ArticleListItem-image')
-        image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
-        image_url = 'https:'+image_url_fragment
+        image_url = extract_image_url(recent_article.find(class_='ArticleListItem-image'))
 
-        category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower()
+        category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text)
         title = recent_article.find(class_='ArticleListItem-title').text
         description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text