[python] crawler (naver news)

2021. 5. 1. 18:43Python/코딩

Ver. Jupyter Notebook (Anaconda3)

▶ crawler_naver news

수집: 기사명, 날짜, 추천수, 좋아요·훈훈해요·슬퍼요·화나요·후속기사 원해요, 댓글 수, 댓글

코딩: github

 

JeongJaeyoung0/crawler

Contribute to JeongJaeyoung0/crawler development by creating an account on GitHub.

github.com

2021.05.08

# crawler_naver news
Step 1. 네이버 > 검색어 입력 > 뉴스> 네이버뉴스 url 수집 > 저장 (naver_news_urls.xlsx)
Step 2. naver_news_urls.xlsx 불러오기 > 제목, 날짜, 댓글, 좋아요 크롤링 > 저장 (naver_news_(keyword).xlsx)

pwd

### Step 0. 준비
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver import ActionChains as AC  # 웹 브라우저 동작
from tqdm import tqdm
from tqdm.notebook import tqdm
import re
from time import sleep
import time

### Step 1. 크롤링할 뉴스들 url 수집
# Collect every "네이버뉴스" link from page 1 of the Naver news search
# results for a user-supplied keyword and save them to an Excel file.
keyword = input("크롤링할 키워드를 입력하세요: ")

# Launch Chrome and open the news-search page for the keyword.
driver = webdriver.Chrome(r"G:\내 드라이브\exe\chromedriver.exe")
driver.get("https://search.naver.com/search.naver?where=news&sm=tab_jum&query={}".format(keyword))
time.sleep(2)  # give the result page time to render

# Anchors whose visible link text is '네이버뉴스' point at articles hosted
# on news.naver.com (the only layout the Step-2 selectors understand).
things = driver.find_elements_by_link_text('네이버뉴스')
url_list = [thing.get_attribute('href') for thing in things]

print(len(url_list))
url_list

# BUG FIX: quit the browser once the urls are collected — the original
# left this first Chrome session (and its chromedriver process) running
# for the whole rest of the script.
driver.quit()

# Persist the url list; index=False keeps the sheet to a single clean
# 'url' column (Step 2 reads it back by that column name).
df = pd.DataFrame({"url": url_list})
df.to_excel('naver_news_urls.xlsx', index=False)

### Step 2. 전체 기사 for문으로 데이터 수집
# Load the url list saved in Step 1.
# BUG FIX: the file was written with to_excel(), so it is an .xlsx
# workbook and must be loaded with read_excel — the original called
# pd.read_csv on it, which fails on the Excel binary format.
df = pd.read_excel('naver_news_urls.xlsx')
print(len(df['url']))
df['url']

# Number of articles to crawl.
number = len(df['url'])
number

# Accumulates one record per article, keyed by its row index in df.
# NOTE(review): the name `dict` shadows the builtin; it is kept as-is
# because the pandas cell further down references this exact name.
dict = {}

# 페이지당 기사 수집 — visit each article, scrape its metadata,
# reaction counters and the full comment list.
for i in tqdm(range(0, number)):   # len(df['url'])
    try:
        # Open a fresh Chrome session for this article.
        driver = webdriver.Chrome(r"G:\내 드라이브\exe\chromedriver.exe")
        driver.get(df['url'][i])
        time.sleep(1)

        # Article metadata and the five reaction counters.
        title = driver.find_element_by_css_selector('.tts_head').text
        date = driver.find_element_by_css_selector('.t11').text
        up = driver.find_element_by_css_selector('.u_cnt._count').text
        like = driver.find_element_by_css_selector(".end_btn .u_likeit_list.good .u_likeit_list_count._count").text
        warm = driver.find_element_by_css_selector(".end_btn .u_likeit_list.warm .u_likeit_list_count._count").text
        sad = driver.find_element_by_css_selector(".end_btn .u_likeit_list.sad .u_likeit_list_count._count").text
        angry = driver.find_element_by_css_selector(".end_btn .u_likeit_list.angry .u_likeit_list_count._count").text
        want = driver.find_element_by_css_selector(".end_btn .u_likeit_list.want .u_likeit_list_count._count").text

        # Total comment count, e.g. "1,234" -> 1234.
        review_count = int(driver.find_element_by_css_selector(".u_cbox_count").text.replace(',', ''))

        # Comments load 20 at a time, so this many "more" clicks are needed.
        moreview_num = review_count // 20

        # Jump to the comment section.
        driver.find_element_by_css_selector(".lo_txt").click()
        time.sleep(1)

        # Click the "more" button until every comment is loaded.
        k = 0
        while k <= moreview_num:
            try:
                driver.find_element_by_css_selector(".u_cbox_page_more").click()
                time.sleep(1)
                k = k + 1
            except Exception:   # BUG FIX: was a bare except
                break   # button gone -> everything is already loaded

        # Collect the visible comment texts.
        review_list = []
        reviews = driver.find_elements_by_css_selector(".u_cbox_text_wrap")
        for review in tqdm(reviews):
            review_list.append(review.text)

        # One record per article; keys become the DataFrame columns later.
        target_info = {}
        target_info['기사명'] = title
        target_info['날짜'] = date
        target_info['추천수'] = up
        target_info['좋아요'] = like
        target_info['훈훈해요'] = warm
        target_info['슬퍼요'] = sad
        target_info['화나요'] = angry
        target_info['후속기사 원해요'] = want
        target_info['댓글 수'] = len(review_list)
        target_info['댓글'] = review_list

        dict[i] = target_info

        print(title, '( 댓글 수:', len(review_list), ')')

        # BUG FIX: quit() instead of close() — close() only closes the
        # window and leaks one chromedriver process per article.
        driver.quit()
        time.sleep(1)

    except Exception:
        # BUG FIX: narrowed from a bare except. Still deliberately
        # best-effort: skip articles whose page layout does not match
        # the selectors above (non-standard news hosts, removed pages).
        driver.quit()
        continue

print(len(dict))
dict

# 판다스화 — turn the per-article records into a DataFrame,
# one row per article index, columns taken from the record keys.
import pandas as pd

result_df = pd.DataFrame.from_dict(dict, orient='index')
result_df

# 엑셀 저장 — write the final table to a workbook named after the keyword.
result_df.to_excel("naver_news_({}).xlsx".format(keyword))