diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ad514b3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.9.0
+certifi==2020.4.5.1
+chardet==3.0.4
+idna==2.9
+requests==2.23.0
+soupsieve==2.0
+urllib3==1.25.9
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..277e7cd
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,98 @@
+"""Scrape the Blizzard news front page and print each post in two formats."""
+import re
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+BASE_URL = 'https://news.blizzard.com'
+NEWS_URL = BASE_URL + '/en-us/'
+# requests never times out on its own, so always pass an explicit limit.
+REQUEST_TIMEOUT = 10  # seconds
+
+# Captures the target of a CSS url(...) value, with or without quotes.
+CSS_URL_RE = re.compile(r'''url\(["']?(.*?)["']?\)''')
+
+
+def _background_image_url(tag):
+    """Return the absolute URL from the url(...) in the style of *tag*."""
+    fragment = CSS_URL_RE.findall(tag.attrs['style'])[0]
+    # The original code prefixed 'https:', so the fragment is presumably
+    # protocol-relative ('//host/path'); urljoin adds the scheme for that
+    # form and also copes with absolute or path-only values.
+    return urljoin(NEWS_URL, fragment)
+
+
+def get_blog():
+    """Fetch the news front page and return its posts as a list of dicts.
+
+    Each dict has the keys 'image', 'game', 'title', 'description' and
+    'url'. Featured articles are listed first, with an empty description.
+
+    Raises:
+        requests.RequestException: if the request fails, times out or the
+            server answers with an error status.
+    """
+    response = requests.get(NEWS_URL, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    blog = []
+
+    for feature in soup.find_all(class_='FeaturedArticle-link'):
+        labels = feature.find_all(class_='text-truncate-ellipsis')
+        blog.append({
+            'image': _background_image_url(feature.find(class_='Card-image')),
+            'game': labels[0].contents[0],
+            'title': labels[1].contents[0],
+            'description': '',
+            'url': urljoin(BASE_URL, feature.attrs['href']),
+        })
+
+    for article in soup.find_all(class_='ArticleListItem'):
+        content = article.find(class_='ArticleListItem-contentGrid')
+        subtitle = content.find(class_='ArticleListItem-subtitle')
+        description = content.find(class_='ArticleListItem-description')
+        link = article.find(class_='ArticleLink')
+        blog.append({
+            'image': _background_image_url(
+                article.find(class_='ArticleListItem-image')),
+            'game': subtitle.find(class_='ArticleListItem-labelInner').contents[0],
+            'title': content.find(class_='ArticleListItem-title').contents[0],
+            'description': description.find(class_='h6').contents[0],
+            'url': urljoin(BASE_URL, link.attrs['href']),
+        })
+
+    return blog
+
+
+def get_body(post):
+    """Return the plain-text body: title, description and URL, one per line."""
+    return '\n'.join((post['title'], post['description'], post['url']))
+
+
+def get_formatted_body(post):
+    """Return the formatted body built from the post's title and description.
+
+    NOTE(review): in the copy of this patch under review the markup inside
+    the literals below was missing and only line breaks were left, so they
+    are written as newline escapes. Restore the intended tags, and escape
+    the scraped text, before relying on this output.
+    """
+    return (
+        '' +
+        '\n' + post['title'] + '\n' +
+        '\n' +
+        '\n\n' + post['description'] + '\n\n'
+    )
+
+
+def main():
+    """Scrape the front page and print every post in both formats."""
+    for post in get_blog():
+        print(get_body(post))
+        print(get_formatted_body(post))
+
+
+if __name__ == '__main__':
+    main()