#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the same-site links found on the page at ``urlmine``.

Runtime dependencies: ``requests`` and ``beautifulsoup4``.

NOTE(review): the requirement.txt added alongside this script lists only
BeautifulSoup4; ``requests`` should be listed there as well.
"""
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 keeps urljoin in the ``urlparse`` module.
    from urlparse import urljoin

import requests
from bs4 import BeautifulSoup

# Root of the site being crawled; absolute links outside it are dropped.
urlmine = "https://git.zeekling.cn/"

# Sent with every request so the fetch presents a browser User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    )
}

# Href prefixes that name their own host instead of being page-relative.
_ABSOLUTE_PREFIXES = ('http://', 'https://', '//')
_HTTP_SCHEMES = ('http://', 'https://')


def getlinks(url, timeout=10):
    """Fetch *url* and return the filtered links found in its ``<a>`` tags.

    Args:
        url: Page to download; also the base for resolving relative hrefs.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list produced by :func:`filterlinks` for the page's anchors.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than harvesting links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return filterlinks(soup.find_all('a'), url)


def filterlinks(links, url_org):
    """Return the unique, absolute, same-site URLs referenced by *links*.

    Args:
        links: Iterable of anchor elements exposing ``.get('href')``.
        url_org: URL of the page the anchors came from; relative hrefs are
            resolved against it.

    Returns:
        A list of absolute http(s) URLs in first-seen order, without
        duplicates. Anchors with no href, bare ``/``, fragment-only
        (``#...``) and query-only (``?...``) hrefs are skipped, as are
        absolute links that do not start with ``urlmine`` and links with a
        non-http scheme such as ``mailto:``.
    """
    kept = []
    seen = set()
    for link in links:
        href = link.get('href')
        if not href:
            # <a> used as a named anchor, or an empty href.
            continue
        href = href.strip()
        if not href or href == '/' or href.startswith(('#', '?')):
            continue

        target = urljoin(url_org, href)
        points_elsewhere = (href.startswith(_ABSOLUTE_PREFIXES)
                            and not target.startswith(urlmine))
        if points_elsewhere:
            continue
        if not target.startswith(_HTTP_SCHEMES):
            # mailto:, javascript:, tel: ... are not crawlable pages.
            continue

        if target not in seen:
            seen.add(target)
            kept.append(target)
    return kept


def main():
    """Print every same-site link found on the ``urlmine`` page."""
    for link in getlinks(urlmine):
        print(link)


if __name__ == "__main__":
    main()