爬取网站url

2020-03-21 22:09:14 +08:00 · 2020-03-21 22:09:14 +08:00 · 28690a838b
commit 28690a838b
parent e1ab9b83b7
2 changed files with 45 additions and 0 deletions
--- a/get_url.py
+++ b/get_url.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# coding=utf-8
+# -*- coding: utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+
+# Root URL of the site to crawl. filterlinks() also compares absolute links
+# against this prefix to decide whether they belong to the same site.
+urlmine = "https://git.zeekling.cn/"
+
+# HTTP request headers presenting a desktop Chrome User-Agent string.
+headers = {
+    'User-Agent':
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
+}
+
+
+def getlinks(url):
+    pages = requests.get(url)
+    html = pages.text
+    soup = BeautifulSoup(html, 'html.parser')
+    links = soup.find_all('a')
+    return filterlinks(links, url)
+
+
+def filterlinks(links, url_org):
+    tmplinks = []
+    for link in links:
+        url = str(link['href'])
+        ishttp = url.startswith('http')
+        ismine = url.startswith(urlmine)
+        if ishttp and (not ismine):
+            continue
+        if url.startswith('#') or '/' == url.strip():
+            continue
+        if url.startswith("?"):
+            continue
+        if not ishttp:
+            url = url_org + url
+        tmplinks.append(url)
+    reslinks = list(set(tmplinks))
+    return reslinks
+
+links = getlinks(urlmine)
+
+for link in links:
+    print(link)
--- a/requirement.txt
+++ b/requirement.txt
@@ -0,0 +1 @@
+BeautifulSoup4