from urllib.parse import urljoin
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import time
import os
import re
import errno


def mkdir_p(path):
    # Recursively create nested directories, like `mkdir -p`.
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use `except OSError, exc:` for Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def get_link(page):
    # Collect the href attribute of every link found inside <td> cells.
    linkData = []
    for cell in page.find_all('td'):  # renamed from `page` to avoid shadowing the argument
        links = cell.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter, left disabled
            data = each.get('href')
            linkData.append(data)
    return linkData


def gain(url):
    # Fetch a page and extract the links it contains.
    # The original body is truncated after the get_link() call; the return and
    # except clauses below are an assumed completion that resolves relative
    # hrefs against the base URL and skips pages that fail to load.
    try:
        page = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(page, 'lxml')  # parse the page content with BeautifulSoup
        links = get_link(soup)  # extract the hrefs
        return [urljoin(url, link) for link in links if link]
    except urllib.error.URLError:
        return []
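
# The original file never defines main(), so the entry point below is a
# minimal sketch under assumptions: the paginated listing URL and the output
# directory are hypothetical placeholders, and the otherwise-unused `time`
# import is taken as a hint that successive requests should be throttled.
def main():
    base_url = 'http://example.com/list?page='  # hypothetical paginated listing
    out_dir = 'output'                          # hypothetical output directory
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, 'links.txt'), 'w') as f:
        for page_no in range(1, 4):             # a few pages, purely illustrative
            for link in gain(base_url + str(page_no)):
                f.write(link + '\n')
            time.sleep(1)                       # throttle successive page fetches
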
if __name__ == '__main__':
    main()