Programming,  Python

[Python] 구글 뉴스 클리핑

우리가 구글에서 “파이썬”이라고 검색을 하고 URL을 보면 다음과 같이 나온다.

/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&hl=ko&gl=KR&ceid=KR%3Ako

import requests
from bs4 import BeautifulSoup

base_url = "https://news.google.com"
search_url = base_url + "/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&hl=ko&gl=KR&ceid=KR%3Ako"
resp = requests.get(search_url)
html_src = resp.text
soup = BeautifulSoup(html_src, 'html.parser')

news_items = soup.select('div[class="xrnccd"]')
print(len(news_items))
print(news_items[0])
print("\n")

for item in news_items[:3]:
    link = item.find('a', attrs={'class':'VDXfz'}).get('href')
    news_link = base_url + link[1:]
    print(news_link)
    
    news_title = item.find('a', attrs={'class':'DY5T1d'}).getText()
    print(news_title)

    news_content = item.find('span', attrs={'class':'xBbh9'}).text
    print(news_content)

    news_agency = item.find('a', attrs={'class':'wEwyrc AVN2gc uQIVzc Sksgp'}).text
    print(news_agency)

    news_reporting = item.find('time', attrs={'class':'WW6dff uQIVzc Sksgp'})
    news_reporting_datetime = news_reporting.get('datetime').split('T')
    news_reporting_date = news_reporting_datetime[0]
    news_reporting_time = news_reporting_datetime[1][:-1]
    print(news_reporting_date, news_reporting_time)    
    print("\n")
    
def google_news_clipping(url, limit=5):
    resp = requests.get(url)
    html_src = resp.text
    soup = BeautifulSoup(html_src, 'html.parser')
    
    news_items = soup.select('div[class="xrnccd"]')
    
    links=[]; titles=[]; contents=[]; agencies=[]; reporting_dates=[]; reporting_times=[];
    
    for item in news_items[:limit]:
        # Get new link
        link = item.find('a', attrs={'class':'VDXfz'}).get('href')
        news_link = base_url + link[1:]
        links.append(news_link)
        
        # Get new title
        news_title = item.find('a', attrs={'class':'DY5T1d'}).getText()
        titles.append(news_title)
    
        # Get new contents
        news_content = item.find('span', attrs={'class':'xBbh9'}).text
        contents.append(news_content)
    
        news_agency = item.find('a', attrs={'class':'wEwyrc AVN2gc uQIVzc Sksgp'}).text
        agencies.append(news_agency)
    
        news_reporting = item.find('time', attrs={'class':'WW6dff uQIVzc Sksgp'})
        news_reporting_datetime = news_reporting.get('datetime').split('T')
        news_reporting_date = news_reporting_datetime[0]
        news_reporting_time = news_reporting_datetime[1][:-1]
        reporting_dates.append(news_reporting_date)
        reporting_times.append(news_reporting_time)     
    # Return dictionary type
    result = {'link':links, 'title':titles, 'contents':contents, 'agency':agencies, \
              'date':reporting_dates, 'time':reporting_times}
    
    return result

news = google_news_clipping(search_url, 2)
print(news)

Leave a Reply

Your email address will not be published. Required fields are marked *