[Python] 구글 뉴스 클리핑
구글 뉴스(news.google.com)에서 “파이썬”을 검색한 뒤 주소창의 URL을 보면, 도메인 뒤에 다음과 같은 경로가 붙는다. 여기서 q 값은 검색어 “파이썬”을 URL 인코딩한 것이다.
/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&hl=ko&gl=KR&ceid=KR%3Ako
import requests
from bs4 import BeautifulSoup
# Step-by-step walkthrough: download one Google News search page and print
# the fields of the first three result cards.
base_url = "https://news.google.com"
# The path is copied from the browser address bar; `q` holds the
# percent-encoded search term, and hl/gl/ceid carry the ko / KR settings.
search_url = base_url + "/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&hl=ko&gl=KR&ceid=KR%3Ako"

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an error page.
resp = requests.get(search_url, timeout=10)
resp.raise_for_status()
html_src = resp.text
soup = BeautifulSoup(html_src, 'html.parser')

# Each search result is expected to sit in a <div class="xrnccd"> element.
# NOTE(review): these class names come from the page markup and may change.
news_items = soup.select('div[class="xrnccd"]')
print(len(news_items))
if news_items:
    # Only peek at the first card when the selector matched something.
    print(news_items[0])
print("\n")

for item in news_items[:3]:
    # Article link: drop the first character of the href and prefix the host
    # (presumably hrefs look like "./articles/..." -- confirm on the page).
    link_tag = item.find('a', attrs={'class': 'VDXfz'})
    link = link_tag.get('href') if link_tag is not None else None
    news_link = (base_url + link[1:]) if link else None
    print(news_link)

    # Headline text.
    title_tag = item.find('a', attrs={'class': 'DY5T1d'})
    news_title = title_tag.getText() if title_tag is not None else None
    print(news_title)

    # Snippet text; printed as None when the card has no such element.
    content_tag = item.find('span', attrs={'class': 'xBbh9'})
    news_content = content_tag.text if content_tag is not None else None
    print(news_content)

    # Publisher name.
    agency_tag = item.find('a', attrs={'class': 'wEwyrc AVN2gc uQIVzc Sksgp'})
    news_agency = agency_tag.text if agency_tag is not None else None
    print(news_agency)

    # The <time> tag's `datetime` attribute is split at 'T' into date and
    # time; a trailing 'Z' is removed from the time part only if present.
    news_reporting = item.find('time', attrs={'class': 'WW6dff uQIVzc Sksgp'})
    stamp = news_reporting.get('datetime') if news_reporting is not None else None
    news_reporting_date, _, news_reporting_time = (stamp or '').partition('T')
    if news_reporting_time.endswith('Z'):
        news_reporting_time = news_reporting_time[:-1]
    print(news_reporting_date, news_reporting_time)
    print("\n")
def _tag_text(tag):
    """Return the text content of a parsed tag, or None if the tag is missing."""
    return tag.getText() if tag is not None else None


def _split_timestamp(stamp):
    """Split a 'DATE' + 'T' + 'TIME[Z]' string into (date, time); None for absent parts."""
    if not stamp:
        return None, None
    date_part, _, time_part = stamp.partition('T')
    # Drop the trailing 'Z' marker only when it is really there.
    if time_part.endswith('Z'):
        time_part = time_part[:-1]
    return date_part, (time_part or None)


def google_news_clipping(url, limit=5):
    """Download a Google News search page and collect fields from its result cards.

    Args:
        url: Address of the search result page to fetch.
        limit: Upper bound on the number of result cards to read. It is used
            as a slice bound, so ``None`` reads every card that was found.

    Returns:
        A dict of parallel lists under the keys 'link', 'title', 'contents',
        'agency', 'date' and 'time'. Position i of every list belongs to the
        same card. A field that cannot be found in a card is stored as None
        so the lists always stay the same length.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # NOTE(review): the class names below come from the page markup and may
    # change; when they do, the selector simply matches nothing.
    news_items = soup.select('div[class="xrnccd"]')

    links = []
    titles = []
    contents = []
    agencies = []
    reporting_dates = []
    reporting_times = []

    for item in news_items[:limit]:
        # Article link, resolved against the fetched page so relative hrefs
        # ("./...", "/...") and absolute ones all produce a valid URL.
        link_tag = item.find('a', attrs={'class': 'VDXfz'})
        href = link_tag.get('href') if link_tag is not None else None
        links.append(urljoin(resp.url, href) if href else None)

        # Headline, snippet and publisher name.
        titles.append(_tag_text(item.find('a', attrs={'class': 'DY5T1d'})))
        contents.append(_tag_text(item.find('span', attrs={'class': 'xBbh9'})))
        agencies.append(
            _tag_text(item.find('a', attrs={'class': 'wEwyrc AVN2gc uQIVzc Sksgp'}))
        )

        # Publication timestamp taken from the <time> tag's `datetime` attribute.
        time_tag = item.find('time', attrs={'class': 'WW6dff uQIVzc Sksgp'})
        stamp = time_tag.get('datetime') if time_tag is not None else None
        news_reporting_date, news_reporting_time = _split_timestamp(stamp)
        reporting_dates.append(news_reporting_date)
        reporting_times.append(news_reporting_time)

    return {
        'link': links,
        'title': titles,
        'contents': contents,
        'agency': agencies,
        'date': reporting_dates,
        'time': reporting_times,
    }
# Usage example: clip the first two results of the search page and show the
# resulting dict of lists.
news = google_news_clipping(search_url, limit=2)
print(news)