[python] 홈페이지 전체 페이지 크롤링 ~ 정보보안(Information Security) 기록 저장소

2019. 9. 22.

[python] 홈페이지 전체 페이지 크롤링

from urllib.request import urlopen

from bs4 import BeautifulSoup

import re

# "/wiki/..." hrefs that linkUrl has already seen; it is consulted before a
# link is handled so that each link is printed and processed only once.
pages = set()

def _pageLinks(addrUrl):
    """Return the "/wiki/..." hrefs of the anchors on one page, in document order.

    addrUrl is a site-relative path such as "/wiki/Python". An empty path
    selects "/wiki/Main_Page", which keeps the existing linkUrl("") entry
    call starting from the main page.
    """
    target = "https://en.m.wikipedia.org" + (addrUrl or "/wiki/Main_Page")
    # The context manager closes the HTTP response once the page has been parsed.
    with urlopen(target) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")
    # The href filter only matches anchors that carry a matching href, so the
    # attribute lookup below cannot fail.
    return [a.attrs["href"] for a in bsObj.findAll("a", href=re.compile("(^/wiki/)"))]


def linkUrl(addrUrl):
    """Crawl internal wiki links depth-first starting from addrUrl.

    Every "/wiki/..." href not yet present in the module-level ``pages`` set
    is added to it, printed, and then crawled in turn.

    Args:
        addrUrl: Site-relative path of the page to start from. An empty
            string starts from the main page.

    Returns:
        None. Results are reported through ``print`` and the ``pages`` set.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a page
            cannot be fetched.
    """
    # An explicit stack of link iterators replaces recursion: one Python stack
    # frame per discovered page would exceed the interpreter's recursion limit
    # on any site with more than a few hundred pages.
    pending = [iter(_pageLinks(addrUrl))]
    while pending:
        for newPages in pending[-1]:
            if newPages in pages:
                continue
            pages.add(newPages)
            print(newPages)
            # Descend into the new page before resuming its parent's remaining
            # links, which is the order a recursive crawl would visit them in.
            pending.append(iter(_pageLinks(newPages)))
            break
        else:
            # Every link on the current page has been handled; back up one level.
            pending.pop()

# Start the crawl as soon as the script runs, passing an empty path.
linkUrl("")