From 28690a838b2e2c2f9df12009c70f556648e545a7 Mon Sep 17 00:00:00 2001
From: zeek <984294471@qq.com>
Date: Sat, 21 Mar 2020 22:09:14 +0800
Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E5=8F=96=E7=BD=91=E7=AB=99url?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the file contents below were revised after export, so the
blob ids on the "index" lines still refer to the originally committed
contents.

 get_url.py      | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 requirement.txt |  2 ++
 2 files changed, 65 insertions(+)
 create mode 100755 get_url.py
 create mode 100644 requirement.txt

diff --git a/get_url.py b/get_url.py
new file mode 100755
index 0000000..51c5454
--- /dev/null
+++ b/get_url.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding=utf-8
+# -*- coding: utf-8 -*-
+"""Print the de-duplicated on-site links found on the urlmine page."""
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+urlmine = "https://git.zeekling.cn/"
+
+headers = {
+    'User-Agent':
+    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
+}
+
+
+def getlinks(url):
+    """Fetch url and return the filtered links from its <a> tags.
+
+    Exceptions raised by requests (connection failure, timeout)
+    propagate to the caller.
+    """
+    # Send the browser User-Agent defined above, and bound the wait so
+    # a stalled server cannot hang the script.
+    pages = requests.get(url, headers=headers, timeout=10)
+    soup = BeautifulSoup(pages.text, 'html.parser')
+    # href=True skips anchors without an href attribute, which would
+    # otherwise raise KeyError on link['href'].
+    links = soup.find_all('a', href=True)
+    return filterlinks(links, url)
+
+
+def filterlinks(links, url_org):
+    """Return the sorted, de-duplicated links worth keeping.
+
+    links is an iterable of <a> tags and url_org is the page they came
+    from. Absolute http(s) links outside urlmine, fragment links
+    ('#...'), query-only links ('?...') and the bare root '/' are
+    dropped; any other non-absolute href is resolved against url_org.
+    """
+    reslinks = set()
+    for link in links:
+        href = link.get('href')
+        if href is None:
+            continue
+        url = str(href).strip()
+        ishttp = url.startswith('http')
+        if ishttp and not url.startswith(urlmine):
+            continue
+        if url == '/' or url.startswith(('#', '?')):
+            continue
+        if not ishttp:
+            # urljoin avoids the doubled slash that plain concatenation
+            # produced for root-relative hrefs such as '/user/repo'.
+            url = urljoin(url_org, url)
+        reslinks.add(url)
+    return sorted(reslinks)
+
+links = getlinks(urlmine)
+
+for link in links:
+    print(link)
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000..91bbd0f
--- /dev/null
+++ b/requirement.txt
@@ -0,0 +1,2 @@
+BeautifulSoup4
+requests