Bladeren bron

爬取网站url

master
zeek 3 maanden geleden
bovenliggende
commit
28690a838b
2 gewijzigde bestanden met toevoegingen van 45 en 0 verwijderingen
  1. +44
    -0
      get_url.py
  2. +1
    -0
      requirement.txt

+ 44
- 0
get_url.py Bestand weergeven

@ -0,0 +1,44 @@
#!/usr/bin/env python
# coding=utf-8
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Root URL of the site being crawled. The script starts its crawl here, and
# filterlinks() drops absolute links that do not begin with this prefix.
urlmine = "https://git.zeekling.cn/"

# HTTP request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    ),
}
def getlinks(url, timeout=10):
    """Download *url* and return the filtered links found on that page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it the call could block indefinitely.

    Returns:
        The result of ``filterlinks`` applied to every ``<a>`` element of
        the downloaded page, with *url* as the base for relative links.
    """
    # Send the module-level ``headers`` so the request carries the
    # browser-like User-Agent, and bound the wait with ``timeout``.
    pages = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'), url)
def filterlinks(links, url_org):
    """Return the unique crawlable URLs found among *links*.

    Args:
        links: Iterable of anchor elements that support ``.get('href')``,
            such as the tags returned by ``soup.find_all('a')``.
        url_org: URL of the page the anchors were taken from; relative
            hrefs are resolved against it.

    Returns:
        A list of URLs without duplicates, in the order they were first
        seen. Skipped entries: anchors without an href, absolute links
        that do not start with ``urlmine``, fragment-only (``#...``) and
        query-only (``?...``) links, and the bare ``/`` link.
    """
    found = []
    for link in links:
        href = link.get('href')
        if href is None:
            # Anchors without an href attribute carry no target to follow.
            continue
        url = str(href)
        ishttp = url.startswith('http')
        # Absolute links are kept only when they point into the crawled site.
        if ishttp and not url.startswith(urlmine):
            continue
        if url.startswith(('#', '?')) or url.strip() == '/':
            continue
        if not ishttp:
            # urljoin resolves relative paths properly; plain concatenation
            # yields doubled slashes or glued path segments.
            url = urljoin(url_org, url)
        found.append(url)
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic from run to run.
    return list(dict.fromkeys(found))
# Script body: collect the links found on the site root page (urlmine) and
# print each resulting URL on its own line.
links = getlinks(urlmine)
for link in links:
    print(link)

+ 1
- 0
requirement.txt Bestand weergeven

@ -0,0 +1 @@
BeautifulSoup4

Laden…
Annuleren
Opslaan