Crawling script completed

zeek 2020-03-22 10:47:14 +08:00
parent 799fa4d1bb
commit 45385e73cf


@@ -4,41 +4,97 @@
import requests
from bs4 import BeautifulSoup

# Crawl root; only links under this host are followed.
url_root = 'https://git.zeekling.cn'
# Seed pages the crawl starts from.
url_mine_list = [
    'https://git.zeekling.cn',
    'https://git.zeekling.cn/zeekling'
]
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# Collected results and maximum recursion depth.
url_res_final = []
max_depth = 2
# Paths that should not be crawled (sign-up/login pages).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]
# Static resource suffixes to skip.
url_static_arr = [
    '.js',
    '.css',
    '.cscc'
]
def is_static(url):
    """Return True if the URL points to a static resource."""
    url = str(url)
    for static in url_static_arr:
        if url.endswith(static):
            return True
    return False


def is_robot(url):
    """Return True if the URL is a path the crawler should not visit."""
    url = str(url)
    for robot in url_robot_arr:
        if url.startswith(robot):
            return True
    return False
def getlinks(url):
    """Fetch a page and return the filtered links it contains."""
    pages = requests.get(url, headers=headers)
    html = pages.text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links)


def filterlinks(links):
    """Keep only crawlable links that belong to url_root."""
    tmplinks = []
    for link in links:
        url = link.get('href')
        if url is None or url == '':
            continue
        url = str(url)
        ishttp = url.startswith('http')
        ismine = url.startswith(url_root)
        if ishttp and (not ismine):
            continue
        if url.startswith('#') or '/' == url.strip():
            continue
        if url.startswith('?'):
            continue
        if is_static(url):
            continue
        if is_robot(url):
            continue
        if not ishttp:
            # Resolve relative links against the site root.
            url = url_root + url
        tmplinks.append(url)
    # Deduplicate before returning.
    reslinks = list(set(tmplinks))
    return reslinks
def parser(url_arr, depth):
    """Recursively collect links from url_arr, up to max_depth levels."""
    url_tmp = []
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    for urlmine in url_arr:
        links = getlinks(urlmine)
        url_tmp.extend(links)
    # Keep only links that have not been collected yet, then recurse.
    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)


parser(url_mine_list, 0)
print('parser result size:', len(url_res_final))
for url in url_res_final:
    print(url)
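
Note: the fetch in getlinks is unguarded, so a single timeout or non-200 response aborts the whole crawl. A minimal hardening sketch, reusing the headers and filterlinks defined above (the 10-second timeout and the safe_getlinks name are illustrative, not part of this commit):

def safe_getlinks(url):
    """Like getlinks, but tolerate network errors and bad status codes."""
    try:
        pages = requests.get(url, headers=headers, timeout=10)
        pages.raise_for_status()
    except requests.RequestException as err:
        print('skip', url, ':', err)
        return []
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'))

Swapping this in for getlinks inside parser would let a failed page be skipped instead of stopping the crawl.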