Crawling script completed

zeek 2020-03-22 10:47:14 +08:00
parent 799fa4d1bb
commit 45385e73cf


@@ -4,41 +4,97 @@
import requests
from bs4 import BeautifulSoup

# Crawl root; only links under this host are followed.
url_root = 'https://git.zeekling.cn'
# Seed pages the crawl starts from.
url_mine_list = [
    'https://git.zeekling.cn',
    'https://git.zeekling.cn/zeekling'
]
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# Collected results and maximum recursion depth.
url_res_final = []
max_depth = 2
# Paths that should not be crawled (sign-up/login pages).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]
# Static resource suffixes to skip.
url_static_arr = [
    '.js',
    '.css',
    '.cscc'
]
def is_static(url):
    """Return True if the URL points to a static resource."""
    url = str(url)
    for static in url_static_arr:
        if url.endswith(static):
            return True
    return False


def is_robot(url):
    """Return True if the URL is a path the crawler should not visit."""
    url = str(url)
    for robot in url_robot_arr:
        if url.startswith(robot):
            return True
    return False
def getlinks(url):
    """Fetch a page and return the filtered links it contains."""
    pages = requests.get(url, headers=headers)
    html = pages.text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links)


def filterlinks(links):
    """Keep only crawlable links that belong to url_root."""
    tmplinks = []
    for link in links:
        url = link.get('href')
        if url is None or url == '':
            continue
        url = str(url)
        ishttp = url.startswith('http')
        ismine = url.startswith(url_root)
        if ishttp and (not ismine):
            continue
        if url.startswith('#') or '/' == url.strip():
            continue
        if url.startswith('?'):
            continue
        if is_static(url):
            continue
        if is_robot(url):
            continue
        if not ishttp:
            # Resolve relative links against the site root.
            url = url_root + url
        tmplinks.append(url)
    # Deduplicate before returning.
    reslinks = list(set(tmplinks))
    return reslinks
def parser(url_arr, depth):
    """Recursively collect links from url_arr, up to max_depth levels."""
    url_tmp = []
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    for urlmine in url_arr:
        links = getlinks(urlmine)
        url_tmp.extend(links)
    # Keep only links that have not been collected yet, then recurse.
    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)


parser(url_mine_list, 0)
print('parser result size:', len(url_res_final))
for url in url_res_final:
    print(url)
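
Note: the fetch in getlinks is unguarded, so a single timeout or non-200 response aborts the whole crawl. A minimal hardening sketch, reusing the headers and filterlinks defined above (the 10-second timeout and the safe_getlinks name are illustrative, not part of this commit):

def safe_getlinks(url):
    """Like getlinks, but tolerate network errors and bad status codes."""
    try:
        pages = requests.get(url, headers=headers, timeout=10)
        pages.raise_for_status()
    except requests.RequestException as err:
        print('skip', url, ':', err)
        return []
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'))

Swapping this in for getlinks inside parser would let a failed page be skipped instead of stopping the crawl.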