Refactor category sanitizing and image URL extraction into their own functions

master
lub 2 years ago
parent fed4b15d08
commit 7990d787fc
  1. 21
      scrape.py

@ -16,6 +16,13 @@ def get_accesstoken_from_file(accesstoken_path):
accesstoken_file.close()
return single_accesstoken
def extract_image_url(image_html):
    """Return the https image URL found in an element's inline CSS ``url(...)``.

    Args:
        image_html: An element object whose ``attrs['style']`` holds an inline
            style string containing a ``url(...)`` value (the callers pass the
            result of a ``find(class_=...)`` lookup on an article element).

    Returns:
        The first URL in the style string. Protocol-relative fragments
        (``//host/path``) get ``https:`` prepended; URLs that already carry
        an ``http://`` or ``https://`` scheme are returned unchanged.

    Raises:
        KeyError: If the element has no ``style`` attribute.
        IndexError: If the style string contains no ``url(...)`` value.
    """
    style = image_html.attrs['style']
    # Only recent articles quote the URL, so quotes are optional. CSS also
    # allows single quotes and padding whitespace inside the parentheses,
    # so accept those as well instead of leaking them into the result.
    fragments = re.findall(r'url\(\s*["\']?(.*?)["\']?\s*\)', style)
    image_url_fragment = fragments[0]
    # Avoid producing "https:https://..." when the URL is already absolute.
    if image_url_fragment.startswith(('http://', 'https://')):
        return image_url_fragment
    return 'https:' + image_url_fragment
def sanitize_category(raw_category):
    """Normalise a category label by removing spaces and colons and lower-casing it."""
    # One translate pass deletes both unwanted characters at once.
    unwanted = str.maketrans('', '', ' :')
    stripped = raw_category.translate(unwanted)
    return stripped.lower()
def get_blog():
url = 'https://news.blizzard.com/en-us/'
html = requests.get(url).text
@ -26,13 +33,11 @@ def get_blog():
blog = []
for featured_article in soup.select('#featured-articles article'):
image_html = featured_article.find(class_='Card-image')
image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0]
image_url = 'https:'+image_url_fragment
image_url = extract_image_url(featured_article.find(class_='Card-image'))
text_list = featured_article.select('.text-truncate-ellipsis')
category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower()
title = text_list[1].contents[0]
category = sanitize_category(text_list[0].text)
title = text_list[1].text
url = base_url+featured_article.find('a').attrs['href']
@ -45,11 +50,9 @@ def get_blog():
})
for recent_article in soup.select('#recent-articles article'):
image_html = recent_article.find(class_='ArticleListItem-image')
image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
image_url = 'https:'+image_url_fragment
image_url = extract_image_url(recent_article.find(class_='ArticleListItem-image'))
category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower()
category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text)
title = recent_article.find(class_='ArticleListItem-title').text
description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text

Loading…
Cancel
Save