refactor category and image into own functions

6 years ago · 7990d787fc
parent fed4b15d08
commit 7990d787fc
1 changed files with 12 additions and 9 deletions
--- a/scrape.py
+++ b/scrape.py
@ -16,6 +16,13 @@ def get_accesstoken_from_file(accesstoken_path):
    accesstoken_file.close()

    return single_accesstoken
+def extract_image_url(image_html):
+    # only recent articles wrap the url in double quotes (""), so the pattern
+    # has to match the url both with and without the surrounding quotes
+    image_url_fragment = re.findall(r'url\("?(.*?)"?\)', image_html.attrs['style'])[0]
+    return 'https:'+image_url_fragment
+def sanitize_category(raw_category):
+    return raw_category.replace(' ', '').replace(':', '').lower()
 def get_blog():
    url = 'https://news.blizzard.com/en-us/'
    html = requests.get(url).text
@ -26,13 +33,11 @@ def get_blog():
    blog = []

    for featured_article in soup.select('#featured-articles article'):
-        image_html = featured_article.find(class_='Card-image')
-        image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0]
-        image_url = 'https:'+image_url_fragment
+        image_url = extract_image_url(featured_article.find(class_='Card-image'))

        text_list = featured_article.select('.text-truncate-ellipsis')
-        category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower()
-        title = text_list[1].contents[0]
+        category = sanitize_category(text_list[0].text)
+        title = text_list[1].text

        url = base_url+featured_article.find('a').attrs['href']

@ -45,11 +50,9 @@ def get_blog():
        })

    for recent_article in soup.select('#recent-articles article'):
-        image_html = recent_article.find(class_='ArticleListItem-image')
-        image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
-        image_url = 'https:'+image_url_fragment
+        image_url = extract_image_url(recent_article.find(class_='ArticleListItem-image'))

-        category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower()
+        category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text)
        title = recent_article.find(class_='ArticleListItem-title').text
        description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text