2019. 10. 11.

[python] 네이버 검색 정보 크롤링해서 DB에 넣기

import requests
from bs4 import BeautifulSoup
from urllib.request import HTTPError
import pymysql
import datetime
# Request headers sent with every search request. The key must be the real
# HTTP header name 'User-Agent'; a misspelled key is transmitted as an unknown
# header and the library's default User-Agent is used instead.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

def _blog_title(anchors):
    """Return the blog post title found among a result item's <a> tags.

    The title is carried by the ``title`` attribute of the 2nd or, failing
    that, the 3rd anchor; the 1st anchor is checked last as a fallback.
    Returns None when none of those anchors exists with a ``title``.
    """
    for index in (1, 2, 0):
        if index < len(anchors) and 'title' in anchors[index].attrs:
            return anchors[index]['title']
    return None


def blog_crawling(keywords):
    """Crawl Naver blog search results and store them in ``naver_blog2``.

    For each keyword, result pages are requested with ``start`` offsets
    1, 11, ..., 991. Every ``li.sh_blog_top`` item found is printed and
    inserted as one row (search time, keyword, title, link, rank, write
    date), committing after each row. Items that lack a title, a link or a
    write date are reported and skipped instead of aborting the crawl.

    Args:
        keywords: iterable of search keyword strings.

    Returns:
        None. The total run time is printed when the crawl ends, whether
        it finished normally or was interrupted by an exception.

    Uses the module-level ``headers`` dict for the HTTP request headers.
    """
    start_time = datetime.datetime.now()

    search_url = 'https://search.naver.com/search.naver'
    # Loop-invariant, so it is built once instead of once per row.
    sql = """insert into naver_blog2(search_date, keyword, title, link,rank_b,write_date) values (%s, %s, %s, %s,%s,%s)"""

    # Connection settings are placeholders; replace them with real values.
    conn = pymysql.connect(host='xxxxxx', user='xxxx', password='xxxxx', db='xxxx', charset='utf8')
    try:
        with conn.cursor() as cursor:
            for keyword in keywords:
                count = 1  # rank of the item within this keyword's results
                for x in range(1, 1000, 10):
                    now = datetime.datetime.now()
                    print(x)

                    # Passing the query through ``params`` URL-encodes it, so
                    # a keyword such as 'c++' is not sent as 'c' plus spaces.
                    params = {
                        'date_from': '',
                        'date_option': 0,
                        'date_to': '',
                        'dup_remove': 1,
                        'nso': '',
                        'post_blogurl': '',
                        'post_blogurl_without': '',
                        'query': keyword,
                        'sm': 'tab_pge',
                        'srchby': 'all',
                        'st': 'sim',
                        'where': 'post',
                        'start': x,
                    }
                    # A timeout keeps one stalled request from hanging the crawl.
                    response = requests.get(search_url, params=params, headers=headers, timeout=10)
                    html = BeautifulSoup(response.text, 'html.parser')
                    bloghtmls = html.find_all('li', class_='sh_blog_top')
                    for bloghtml in bloghtmls:
                        print(count)

                        anchors = bloghtml.select('a')
                        title = _blog_title(anchors)
                        # URL: href of the first <a> tag in the item.
                        link = anchors[0].get('href') if anchors else None
                        blog_name = bloghtml.select_one('.txt84')
                        write_date = bloghtml.select_one('.txt_inline')

                        if title is None or link is None or write_date is None:
                            # Unexpected markup: skip the row but keep the rank
                            # counter aligned with the item's position.
                            print('skipped: missing title, link or write date')
                            count += 1
                            continue

                        print(title)  # blog post title
                        print(link)  # post URL
                        if blog_name is not None:
                            print(blog_name.get_text())  # blog name
                        print(write_date.get_text())  # registration date
                        print('\n')

                        cursor.execute(sql, (now, keyword, title, link, count, write_date.get_text()))
                        conn.commit()

                        count += 1

    finally:
        conn.close()
        end_time = datetime.datetime.now()
        run_time = end_time - start_time
        print(run_time)

# Run the crawl only when this file is executed as a script, so that merely
# importing the module does not trigger network requests and database writes.
if __name__ == '__main__':
    blog_crawling(['python','c++','java'])