import datetime
from urllib.request import HTTPError

import pymysql
import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every search request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

# Endpoint of the search page; the query string is supplied via ``params``.
SEARCH_URL = 'https://search.naver.com/search.naver'

# Seconds to wait for the search server before giving up on a request.
REQUEST_TIMEOUT = 10

# Parameterized insert; values are bound by the driver, never interpolated.
INSERT_SQL = """insert into naver_blog2(search_date, keyword, title, link,rank_b,write_date) values (%s, %s, %s, %s,%s,%s)"""


def _extract_post(bloghtml):
    """Read one search-result ``<li>`` element.

    Returns a ``(title, link, blog_name, write_date)`` tuple, or ``None``
    when the element lacks a title, a link or a posting date.
    """
    anchors = bloghtml.select('a')
    if not anchors:
        return None

    # Post URL: the href of the first <a> inside the result item.
    link = anchors[0].get('href')

    # Blog title: the ``title`` attribute of the 2nd <a>, otherwise the 3rd.
    # The 1st <a> is tried last, which is the index the original fell back to.
    title = None
    for index in (1, 2, 0):
        if index < len(anchors) and 'title' in anchors[index].attrs:
            title = anchors[index]['title']
            break

    blog_name_tag = bloghtml.select_one('.txt84')   # blog name
    date_tag = bloghtml.select_one('.txt_inline')   # posting date
    if title is None or link is None or date_tag is None:
        return None

    # The blog name is only printed, never stored, so a missing one is not fatal.
    blog_name = blog_name_tag.get_text() if blog_name_tag is not None else ''
    return title, link, blog_name, date_tag.get_text()


def blog_crawling(keywords):
    """Crawl blog search results for each keyword and store them in MySQL.

    For every keyword the search page is requested with ``start`` values
    1, 11, ..., 991. Each ``li.sh_blog_top`` element found is printed and
    inserted into the ``naver_blog2`` table together with the time of the
    page request, the keyword and a per-keyword running rank.

    Args:
        keywords: Iterable of search terms. Terms are URL-encoded by
            ``requests``, so characters such as ``+`` are sent literally.

    Raises:
        requests.RequestException: If a search request fails or times out.
        pymysql.MySQLError: If connecting or inserting fails.

    The database connection is always closed; the elapsed run time is printed
    only when the crawl finishes without an exception.
    """
    start_time = datetime.datetime.now()
    conn = pymysql.connect(host='xxxxxx', user='xxxx', password='xxxxx', db='xxxx', charset='utf8')
    try:
        with conn.cursor() as cursor:
            for keyword in keywords:
                count = 1  # rank of the result within this keyword, across pages
                for x in range(1, 1000, 10):
                    now = datetime.datetime.now()
                    print(x)
                    # Same parameters, in the same order, as the original
                    # hand-built query string; requests handles the encoding.
                    params = {
                        'date_from': '',
                        'date_option': '0',
                        'date_to': '',
                        'dup_remove': '1',
                        'nso': '',
                        'post_blogurl': '',
                        'post_blogurl_without': '',
                        'query': keyword,
                        'sm': 'tab_pge',
                        'srchby': 'all',
                        'st': 'sim',
                        'where': 'post',
                        'start': x,
                    }
                    response = requests.get(
                        SEARCH_URL,
                        params=params,
                        headers=headers,
                        timeout=REQUEST_TIMEOUT,
                    )
                    html = BeautifulSoup(response.text, 'html.parser')
                    for bloghtml in html.find_all('li', class_='sh_blog_top'):
                        print(count)
                        post = _extract_post(bloghtml)
                        if post is None:
                            # Unexpected markup: skip the row but keep the
                            # rank aligned with the result's position.
                            print('skipped: result item has unexpected markup')
                            count += 1
                            continue
                        title, link, blog_name, write_date = post
                        print(title)
                        print(link)
                        print(blog_name)
                        print(write_date)
                        print('\n')
                        cursor.execute(
                            INSERT_SQL,
                            (now, keyword, title, link, count, write_date),
                        )
                        # Commit per row so an interrupted crawl keeps
                        # everything inserted so far.
                        conn.commit()
                        count += 1
    finally:
        conn.close()
    end_time = datetime.datetime.now()
    run_time = end_time - start_time
    print(run_time)


if __name__ == '__main__':
    blog_crawling(['python', 'c++', 'java'])