爬取网站url

2020-03-21 22:09:14 +08:00 · 2020-03-21 22:09:14 +08:00 · 28690a838b
commit 28690a838b
parent e1ab9b83b7
2 changed files with 45 additions and 0 deletions
--- a/get_url.py
+++ b/get_url.py
@ -0,0 +1,44 @@
 #!/usr/bin/env python
 # coding=utf-8
 # -*- coding: utf-8 -*-
 import requests
 from bs4 import BeautifulSoup
 # Root URL of the site being crawled; absolute links that do not start
 # with this prefix are treated as external by filterlinks().
 urlmine = "https://git.zeekling.cn/"
 # HTTP request headers carrying a desktop-Chrome User-Agent string.
 headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    ),
 }
 def getlinks(url, timeout=10):
    """Download *url* and return the filtered links found on that page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The list produced by ``filterlinks`` for every ``<a>`` element
        in the downloaded HTML.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # Pass the module-level ``headers`` so the configured User-Agent is
    # actually sent; it was defined but never used before.
    pages = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'), url)
 def filterlinks(links, url_org):
    """Return the unique on-site URLs referenced by *links*.

    Args:
        links: Iterable of anchor elements that support ``.get('href')``
            (for example the tags returned by ``soup.find_all('a')``).
        url_org: URL of the page the anchors came from; relative hrefs
            are resolved against it.

    Returns:
        A list of absolute URLs with duplicates removed, in the order
        they were first seen.
    """
    # Imported locally so the function brings its own dependency.
    from urllib.parse import urljoin

    tmplinks = []
    for link in links:
        href = link.get('href')
        if href is None:
            # Anchors without an href have no target; indexing with
            # link['href'] would raise KeyError for them.
            continue
        url = str(href).strip()
        # Empty hrefs, the bare root, in-page fragments and query-only
        # links do not name a new page.
        if not url or url == '/' or url.startswith(('#', '?')):
            continue
        # urljoin takes care of slashes between base and path and leaves
        # already-absolute URLs (and non-HTTP schemes) untouched.
        resolved = urljoin(url_org, url)
        if url.startswith(('http', '//')):
            # Absolute or protocol-relative link: keep it only when it
            # points at the crawled site.
            if not resolved.startswith(urlmine):
                continue
        elif not resolved.startswith('http'):
            # mailto:, javascript:, tel: and similar are not pages.
            continue
        tmplinks.append(resolved)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic.
    return list(dict.fromkeys(tmplinks))
 # Crawl the site's root page and print each collected link on its own line.
 links = getlinks(urlmine)
 if links:
    print("\n".join(links))
--- a/requirement.txt
+++ b/requirement.txt
@ -0,0 +1 @@
 BeautifulSoup4
 requests