2019. 9. 22.

[python] 홈페이지 전체 페이지 크롤링

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Hrefs (matching ^/wiki/) that linkUrl has already recorded; linkUrl checks
# this set before following a link so the same href is not processed twice.
pages = set()

def linkUrl(addrUrl):
    """Crawl Wikipedia article links depth-first, starting from *addrUrl*.

    Every ``<a>`` whose ``href`` starts with ``/wiki/`` is recorded in the
    module-level ``pages`` set and printed the first time it is seen, and
    the page it points to is then crawled in turn.

    Args:
        addrUrl: Site-relative path (e.g. ``"/wiki/Python"``) or absolute
            URL of the page to start from. An empty string starts at the
            main page, which keeps the existing ``linkUrl("")`` call working.

    Side effects:
        Adds each newly discovered href to ``pages`` and prints it.
    """
    from urllib.parse import urljoin

    base_url = "https://en.m.wikipedia.org"
    # Compile once instead of on every page fetch.
    wiki_link = re.compile("(^/wiki/)")

    def fetch_links(path):
        # Resolve against the site root so both "/wiki/..." paths and full
        # URLs work; close the response as soon as the body is read.
        with urlopen(urljoin(base_url, path)) as response:
            soup = BeautifulSoup(response.read(), "html.parser")
        # The href= filter already guarantees the attribute exists.
        return [a.attrs["href"] for a in soup.find_all("a", href=wiki_link)]

    # An explicit stack of link iterators replaces recursion: the visiting
    # order is the same depth-first pre-order, but crawl depth is no longer
    # capped by Python's recursion limit.
    stack = [iter(fetch_links(addrUrl or "/wiki/Main_Page"))]
    while stack:
        for href in stack[-1]:
            if href in pages:
                continue
            pages.add(href)
            print(href)
            # Descend into the new page before continuing with siblings.
            stack.append(iter(fetch_links(href)))
            break
        else:
            # Current page's links are exhausted; resume its parent.
            stack.pop()
# Start the crawl as soon as the script runs, passing an empty start address.
linkUrl("")

Popular Posts

Recent Posts

Powered by Blogger.