from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import pymysql
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='2kdskfk1!@', db='scraping', charset='utf8')
cur = conn.cursor()
cur.execute("USE scraping")
# Seed with a float; newer Python versions reject datetime objects as seeds
random.seed(datetime.datetime.now().timestamp())
def store(title, content):
    # Let pymysql quote and escape the parameters itself;
    # the %s placeholders must not be wrapped in quotes
    cur.execute(
        "INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content)
    )
    cur.connection.commit()
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    title = bsObj.find("h1").get_text()
    content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    # Return only internal /wiki/ links, skipping namespace pages like Talk: or File:
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:
        # Follow a random internal link from the current page until interrupted
        newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    # Always release the cursor and connection, even on Ctrl+C
    cur.close()
    conn.close()
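
The listing above assumes a pages table already exists in the scraping database. A minimal sketch of one schema that satisfies store() — the id and created columns and the column sizes are illustrative assumptions, not taken from the listing:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='2kdskfk1!@', db='scraping', charset='utf8')
cur = conn.cursor()
# Only title and content are required by store(); id and created are conveniences
cur.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INT NOT NULL AUTO_INCREMENT,
        title VARCHAR(200),
        content VARCHAR(10000),
        created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
""")
cur.close()
conn.close()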
Since this was only a basic test, you can see that quite a few unneeded values end up stored in the table.
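
A quick way to see what the crawl actually saved is to read a few rows back. A minimal sketch, assuming the same local MySQL instance and the pages table used above:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='2kdskfk1!@', db='scraping', charset='utf8')
cur = conn.cursor()
# Print each stored title with the first 80 characters of its content
cur.execute("SELECT title, LEFT(content, 80) FROM pages LIMIT 10")
for title, snippet in cur.fetchall():
    print(title, '|', snippet)
cur.close()
conn.close()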