|
|
|
@ -16,6 +16,13 @@ def get_accesstoken_from_file(accesstoken_path): |
|
|
|
|
accesstoken_file.close() |
|
|
|
|
|
|
|
|
|
return single_accesstoken |
|
|
|
|
def extract_image_url(image_html):
    """Return the absolute image URL found in an element's inline style.

    Reads ``image_html.attrs['style']``, takes the contents of the first
    CSS ``url(...)`` expression and turns it into an ``https`` URL.

    Args:
        image_html: An object exposing an ``attrs`` mapping with a
            ``'style'`` entry (the callers in this file pass the result of
            a ``find(class_=...)`` lookup).

    Returns:
        The URL as a string. Fragments that do not already start with
        ``http://`` or ``https://`` are prefixed with ``'https:'``.

    Raises:
        IndexError: If the style contains no ``url(...)`` expression.
    """
    raw_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
    # The value inside url(...) may be wrapped in double quotes, single
    # quotes or nothing at all, and may be padded with whitespace, so peel
    # all of that off instead of encoding every variant in the regex.
    image_url_fragment = raw_fragment.strip(' \t\r\n"\'')
    # Avoid producing "https:https://..." when the URL is already absolute.
    if image_url_fragment.startswith(('http://', 'https://')):
        return image_url_fragment
    return 'https:' + image_url_fragment
|
|
|
|
def sanitize_category(raw_category):
    """Normalize a category label: drop spaces and colons, then lowercase.

    Args:
        raw_category: The category text as a string.

    Returns:
        The lowercased string with every ``' '`` and ``':'`` removed.
        Other whitespace (tabs, newlines) is left untouched.
    """
    # One translate pass deletes both unwanted characters at once.
    unwanted = str.maketrans('', '', ' :')
    stripped = raw_category.translate(unwanted)
    return stripped.lower()
|
|
|
|
def get_blog(): |
|
|
|
|
url = 'https://news.blizzard.com/en-us/' |
|
|
|
|
html = requests.get(url).text |
|
|
|
@ -26,13 +33,11 @@ def get_blog(): |
|
|
|
|
blog = [] |
|
|
|
|
|
|
|
|
|
for featured_article in soup.select('#featured-articles article'): |
|
|
|
|
image_html = featured_article.find(class_='Card-image') |
|
|
|
|
image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0] |
|
|
|
|
image_url = 'https:'+image_url_fragment |
|
|
|
|
image_url = extract_image_url(featured_article.find(class_='Card-image')) |
|
|
|
|
|
|
|
|
|
text_list = featured_article.select('.text-truncate-ellipsis') |
|
|
|
|
category = text_list[0].contents[0].replace(' ', '').replace(':', '').lower() |
|
|
|
|
title = text_list[1].contents[0] |
|
|
|
|
category = sanitize_category(text_list[0].text) |
|
|
|
|
title = text_list[1].text |
|
|
|
|
|
|
|
|
|
url = base_url+featured_article.find('a').attrs['href'] |
|
|
|
|
|
|
|
|
@ -45,11 +50,9 @@ def get_blog(): |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
for recent_article in soup.select('#recent-articles article'): |
|
|
|
|
image_html = recent_article.find(class_='ArticleListItem-image') |
|
|
|
|
image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0] |
|
|
|
|
image_url = 'https:'+image_url_fragment |
|
|
|
|
image_url = extract_image_url(recent_article.find(class_='ArticleListItem-image')) |
|
|
|
|
|
|
|
|
|
category = recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text.replace(' ', '').replace(':', '').lower() |
|
|
|
|
category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text) |
|
|
|
|
title = recent_article.find(class_='ArticleListItem-title').text |
|
|
|
|
description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text |
|
|
|
|
|
|
|
|
|