add initial scraping and message concat
parent
180a5d2d12
commit
bd8af62765
@ -0,0 +1,7 @@
|
||||
beautifulsoup4==4.9.0
|
||||
certifi==2020.4.5.1
|
||||
chardet==3.0.4
|
||||
idna==2.9
|
||||
requests==2.23.0
|
||||
soupsieve==2.0
|
||||
urllib3==1.25.9
|
@ -0,0 +1,66 @@
|
||||
import requests
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Captures the target of a CSS ``url(...)`` value; surrounding quotes and
# padding whitespace are optional and excluded from the captured group.
_CSS_URL_RE = re.compile(r'url\(\s*["\']?(.*?)["\']?\s*\)')


def _background_image_url(tag, base_url):
    """Return the absolute image URL from the ``url(...)`` in *tag*'s style.

    Returns '' when *tag* is None, has no ``style`` attribute, or the style
    holds no non-empty ``url(...)`` value.
    """
    from urllib.parse import urljoin

    if tag is None:
        return ''
    match = _CSS_URL_RE.search(tag.attrs.get('style', ''))
    if match is None or not match.group(1):
        return ''
    # urljoin resolves protocol-relative ("//host/img.jpg"), root-relative
    # and already-absolute targets alike.
    return urljoin(base_url, match.group(1))


def get_blog(timeout=10):
    """Scrape the Blizzard news landing page into a list of post dicts.

    Featured cards (``FeaturedArticle-link``) come first, followed by the
    regular article list (``ArticleListItem``).

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list of dicts with the keys ``image``, ``game``, ``title``,
        ``description`` and ``url``. Featured cards always get an empty
        ``description``; ``image`` is '' when no background image is found.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.

    NOTE(review): the text lookups below are unguarded, so a card that lacks
    one of the expected elements will still raise (AttributeError,
    IndexError or KeyError) -- confirm whether skipping such cards is wanted.
    """
    from urllib.parse import urljoin

    url = 'https://news.blizzard.com/en-us/'
    base_url = 'https://news.blizzard.com'

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    blog = []

    for feature_html in soup.find_all(class_='FeaturedArticle-link'):
        image_html = feature_html.find(class_='Card-image')
        # The first truncated text node is used as the game label, the
        # second as the headline.
        text_list = feature_html.find_all(class_='text-truncate-ellipsis')

        blog.append({
            'image': _background_image_url(image_html, base_url),
            'game': text_list[0].contents[0],
            'title': text_list[1].contents[0],
            'description': '',
            'url': urljoin(base_url, feature_html.attrs['href']),
        })

    for article_html in soup.find_all(class_='ArticleListItem'):
        image_html = article_html.find(class_='ArticleListItem-image')
        content_html = article_html.find(class_='ArticleListItem-contentGrid')

        game_html = (
            content_html
            .find(class_='ArticleListItem-subtitle')
            .find(class_='ArticleListItem-labelInner')
        )
        description_html = (
            content_html
            .find(class_='ArticleListItem-description')
            .find(class_='h6')
        )
        link_html = article_html.find(class_='ArticleLink')

        blog.append({
            'image': _background_image_url(image_html, base_url),
            'game': game_html.contents[0],
            'title': content_html.find(class_='ArticleListItem-title').contents[0],
            'description': description_html.contents[0],
            'url': urljoin(base_url, link_html.attrs['href']),
        })

    return blog
|
||||
def get_body(post):
    """Return the plain-text form of *post*.

    The result is the post's title, description and URL, in that order,
    separated by single newlines.
    """
    fields = (post['title'], post['description'], post['url'])
    return "\n".join(fields)
|
||||
def get_formatted_body(post):
    """Return the HTML form of *post*.

    The title is rendered as an ``<h5>`` wrapped in a link to the post's
    URL, followed by the description in a ``<p>``.

    All three fields are HTML-escaped before interpolation: they originate
    from scraped page text, so a raw ``&``, ``<`` or ``"`` would otherwise
    produce malformed markup or allow markup injection.
    """
    from html import escape

    return '<a href="{url}"><h5>{title}</h5></a><p>{description}</p>'.format(
        # escape() also quotes " and ', which keeps the href attribute intact.
        url=escape(post['url']),
        title=escape(post['title']),
        description=escape(post['description']),
    )
|
||||
|
||||
# Script body: scrape the news page once, then print every post twice --
# first in its plain-text form, then in its HTML form.
blog = get_blog()
for post in blog:
    for render in (get_body, get_formatted_body):
        print(render(post))
|
Loading…
Reference in New Issue