#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Depth-limited crawler that collects internal links from git.zeekling.cn."""

import requests
from bs4 import BeautifulSoup

# Root of the site; only links under this host are followed.
url_root = 'https://git.zeekling.cn'

# Seed pages the crawl starts from.
url_mine_list = [
    'https://git.zeekling.cn/',
    'https://git.zeekling.cn/zeekling',
    'https://git.zeekling.cn/deep-learn',
    'https://git.zeekling.cn/java',
    'https://git.zeekling.cn/python',
    'https://git.zeekling.cn/linux',
    'https://git.zeekling.cn/mirror'
]

# Browser-like User-Agent so the server treats us like a normal client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# All unique URLs collected so far (list for stable extension order).
url_res_final = []

# Maximum recursion depth for parser().
max_depth = 2

# Path prefixes that must not be crawled (auth-related pages).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]

# Suffixes of static assets / junk hrefs to skip.
# NOTE(review): '.cscc' looks like a typo (perhaps '.scss'?) — confirm.
url_static_arr = [
    '.js',
    '.css',
    '.cscc',
    'None',
    'about:blank'
]


def is_static(url):
    """Return True if *url* ends with a known static-asset suffix."""
    # str.endswith accepts a tuple of suffixes — one C-level call.
    return str(url).endswith(tuple(url_static_arr))


def is_robot(url):
    """Return True if *url* starts with a disallowed (robots-style) path."""
    return str(url).startswith(tuple(url_robot_arr))


def getlinks(url):
    """Fetch *url* and return the filtered list of in-site links on the page.

    Returns an empty list when the request fails or times out, so a single
    bad page cannot abort the whole crawl.
    """
    try:
        # BUG FIX: the original ignored the prepared `headers` dict and had
        # no timeout, so a stalled server could hang the crawler forever.
        pages = requests.get(url, headers=headers, timeout=10)
        pages.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'))


def filterlinks(links):
    """Normalize <a> tags into absolute in-site URLs, dropping junk.

    Skipped: missing/empty hrefs, off-site absolute links, fragments,
    a bare '/', query-only links, static assets, and robot-excluded paths.
    Relative links are resolved against url_root. Duplicates are removed
    (resulting order is arbitrary, as in the original set-based dedup).
    """
    tmplinks = []
    for link in links:
        href = link.get('href')
        # BUG FIX: the original called str() before the None check, so a
        # missing href became the literal string 'None' and the check
        # never fired (it was only caught by the 'None' static suffix).
        if href is None:
            continue
        url = str(href)
        if url == '':
            continue
        ishttp = url.startswith('http')
        if ishttp and not url.startswith(url_root):
            continue  # absolute link pointing off-site
        if url.startswith('#') or url.strip() == '/':
            continue
        if url.startswith('?'):
            continue
        if is_static(url) or is_robot(url):
            continue
        if not ishttp:
            # Resolve relative paths against the site root.
            url = url_root + url if url.startswith('/') else url_root + '/' + url
        tmplinks.append(url)
    return list(set(tmplinks))


def parser(url_arr, depth):
    """Recursively crawl *url_arr*, accumulating new links in url_res_final.

    Recursion stops once *depth* reaches max_depth; only URLs not already
    collected are followed at the next level.
    """
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    url_tmp = []
    for urlmine in url_arr:
        url_tmp.extend(getlinks(urlmine))
    # Keep only URLs we have not seen before to avoid re-crawling.
    url_tmp = list(set(url_tmp).difference(url_res_final))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)


if __name__ == '__main__':
    parser(url_mine_list, 0)
    print('parser result size:', len(url_res_final))
    # for url in url_res_final:
    #     print(url)