From 45385e73cfad3241f519a7c776f850f714c12dc5 Mon Sep 17 00:00:00 2001
From: zeek <984294471@qq.com>
Date: Sun, 22 Mar 2020 10:47:14 +0800
Subject: [PATCH] Finish the crawler script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 get_url.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/get_url.py b/get_url.py
index 55f0831..31a11a3 100755
--- a/get_url.py
+++ b/get_url.py
@@ -4,41 +4,97 @@
 import requests
 from bs4 import BeautifulSoup
 
-urlmine = "https://git.zeekling.cn/"
+
+url_root = 'https://git.zeekling.cn'
+url_mine_list = [
+    'https://git.zeekling.cn',
+    'https://git.zeekling.cn/zeekling'
+]
+
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 }
+url_res_final = []
+max_depth = 2
+url_robot_arr = [
+    '/user/sign_up',
+    '/user/login',
+    '/user/forgot_password'
+]
+url_static_arr = [
+    '.js',
+    '.css',
+    '.cscc'
+]
+
+
+def is_static(url):
+    url = str(url)
+    for static in url_static_arr:
+        if url.endswith(static):
+            return True
+    return False
+
+
+def is_robot(url):
+    url = str(url)
+    for robot in url_robot_arr:
+        if url.startswith(robot):
+            return True
+    return False
 
 
 def getlinks(url):
     pages = requests.get(url)
     html = pages.text
     soup = BeautifulSoup(html, 'html.parser')
     links = soup.find_all('a')
-    return filterlinks(links, url)
+    return filterlinks(links)
 
 
-def filterlinks(links, url_org):
+def filterlinks(links):
     tmplinks = []
     for link in links:
-        url = str(link['href'])
+        url = link.get('href')
+        if url is None or url == '':
+            continue
         ishttp = url.startswith('http')
-        ismine = url.startswith(urlmine)
+        ismine = url.startswith(url_root)
         if ishttp and (not ismine):
             continue
         if url.startswith('#') or '/' == url.strip():
             continue
         if url.startswith("?"):
             continue
+        if is_static(url):
+            continue
+        if is_robot(url):
+            continue
         if not ishttp:
-            url = url_org + url
+            url = url_root + url
         tmplinks.append(url)
     reslinks = list(set(tmplinks))
     return reslinks
 
 
-links = getlinks(urlmine)
-for link in links:
-    print(link)
+def parser(url_arr, depth):
+    url_tmp = []
+    if depth >= max_depth:
+        return
+    depth += 1
+    print('parsing depth:', depth, ' parse urls:', len(url_arr))
+    for urlmine in url_arr:
+        links = getlinks(urlmine)
+        url_tmp.extend(links)
+    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
+    url_res_final.extend(url_tmp)
+    parser(url_tmp, depth)
+
+
+parser(url_mine_list, 0)
+print('parser result size:', len(url_res_final))
+for url in url_res_final:
+    print(url)
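
Review note: the headers dict with the browser User-Agent is defined but never
passed to requests.get(), and the request has no timeout, so a single slow page
can stall the whole crawl. A minimal hedged variant of getlinks() with both
fixed (the 10-second timeout and the return-empty-on-error behaviour are
assumptions, not part of the patch; headers and filterlinks are the ones
defined above):

    import requests
    from bs4 import BeautifulSoup

    def getlinks(url):
        # Send the configured User-Agent and bound the wait; a failed or
        # slow page contributes no links instead of aborting the crawl.
        try:
            pages = requests.get(url, headers=headers, timeout=10)
            pages.raise_for_status()
        except requests.RequestException:
            return []
        soup = BeautifulSoup(pages.text, 'html.parser')
        return filterlinks(soup.find_all('a'))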
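
Review note: parser() is a breadth-first walk, one recursion level per depth,
with url_res_final plus a set difference preventing revisits. A self-contained
sketch of the same traversal, handy for testing it without the network (crawl,
fetch_links, seen and frontier are illustrative names, not from the patch):

    def crawl(seeds, max_depth, fetch_links):
        seen = []                  # plays the role of url_res_final
        frontier = list(seeds)
        for depth in range(1, max_depth + 1):
            print('parsing depth:', depth, ' parse urls:', len(frontier))
            found = []
            for url in frontier:
                found.extend(fetch_links(url))
            # keep only URLs not collected at a shallower depth
            frontier = list(set(found).difference(seen))
            seen.extend(frontier)
        return seen

    # A fake site small enough to trace by hand.
    site = {'/': ['/a', '/b'], '/a': ['/b', '/c'], '/b': [], '/c': ['/d']}
    print(crawl(['/'], 2, lambda u: site.get(u, [])))
    # Depth 1 fetches '/', depth 2 fetches '/a' and '/b'; '/c' is collected
    # but never fetched because max_depth is 2. As in the patch, the seeds
    # themselves never enter the result list, only pages discovered from them.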
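
Review note: relative hrefs are absolutized by plain concatenation
(url = url_root + url), which is only correct for paths starting with '/'.
Hrefs such as 'issues', '../explore' or protocol-relative '//host/path' would
yield broken URLs. If those ever show up, urllib.parse.urljoin from the
standard library covers every case; a hedged sketch (adopting it would mean
passing the originating page URL back into filterlinks(), the parameter this
commit just removed):

    from urllib.parse import urljoin

    # Resolve an href against the page it was scraped from.
    print(urljoin('https://git.zeekling.cn/zeekling', '/user/login'))
    # -> https://git.zeekling.cn/user/login
    print(urljoin('https://git.zeekling.cn/zeekling/', '../explore/repos'))
    # -> https://git.zeekling.cn/explore/repos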