#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
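
# Simple recursive crawler for git.zeekling.cn: starting from the seed pages
# below, it follows on-site links up to max_depth levels and collects every
# unique URL it finds in url_res_final.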
url_root = 'https://git.zeekling.cn'
url_mine_list = [
    'https://git.zeekling.cn/',
    'https://git.zeekling.cn/zeekling',
    'https://git.zeekling.cn/deep-learn',
    'https://git.zeekling.cn/java',
    'https://git.zeekling.cn/python',
    'https://git.zeekling.cn/linux',
    'https://git.zeekling.cn/mirror'
]
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# All unique URLs discovered so far.
url_res_final = []
# Maximum crawl depth.
max_depth = 2

# Paths that a crawler should not visit (account pages).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]

# Link targets to skip: static resources and hrefs that are not real pages.
url_static_arr = [
    '.js',
    '.css',
    '.cscc',
    'None',
    'about:blank'
]


def is_static(url):
    """Return True if the URL points at a static resource or a non-link."""
    url = str(url)
    for static in url_static_arr:
        if url.endswith(static):
            return True
    return False


def is_robot(url):
    """Return True if the URL starts with a path that should not be crawled."""
    url = str(url)
    for robot in url_robot_arr:
        if url.startswith(robot):
            return True
    return False


def getlinks(url):
    """Fetch a page and return the filtered links it contains."""
    # Send the browser User-Agent defined above with every request.
    pages = requests.get(url, headers=headers)
    html = pages.text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links)


def filterlinks(links):
    """Keep only crawlable on-site links, normalised to absolute URLs."""
    tmplinks = []
    for link in links:
        href = link.get('href')
        # Skip anchors that have no usable href.
        if href is None or href == '':
            continue
        url = str(href)
        ishttp = url.startswith('http')
        ismine = url.startswith(url_root)
        # Skip absolute links that point off-site.
        if ishttp and (not ismine):
            continue
        # Skip in-page anchors and the bare root path.
        if url.startswith('#') or '/' == url.strip():
            continue
        # Skip query-only links.
        if url.startswith('?'):
            continue
        if is_static(url):
            continue
        if is_robot(url):
            continue
        # Turn relative paths into absolute URLs under url_root.
        if not ishttp and url.startswith('/'):
            url = url_root + url
        elif not ishttp:
            url = url_root + '/' + url
        tmplinks.append(url)
    # Deduplicate before returning.
    reslinks = list(set(tmplinks))
    return reslinks


def parser(url_arr, depth):
    """Recursively crawl url_arr, going at most max_depth levels deep."""
    url_tmp = []
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    for urlmine in url_arr:
        links = getlinks(urlmine)
        url_tmp.extend(links)
    # Only keep (and recurse into) links that have not been seen before.
    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)
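

# Crawl the site starting from the seed pages at depth 0.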
parser(url_mine_list, 0)

# print('parser result size:', len(url_res_final))
# for url in url_res_final:
#     print(url)
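# To inspect the crawl result, uncomment the print lines above and run this
# script with python3.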