爬取网站url
This commit is contained in:
parent
e1ab9b83b7
commit
28690a838b
44
get_url.py
Executable file
44
get_url.py
Executable file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Root URL of the site to crawl. It is the start page of the crawl and the
# prefix that absolute links must carry to count as belonging to this site.
urlmine = "https://git.zeekling.cn/"

# Browser-like HTTP headers intended for outgoing requests, so the crawler
# identifies itself as a desktop Chrome client.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
def getlinks(url):
    """Download the page at ``url`` and return the links found on it.

    The page is parsed with BeautifulSoup's ``html.parser`` and every
    ``<a>`` element is handed to ``filterlinks`` together with ``url``,
    which decides which links are kept and how they are made absolute.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``filterlinks`` returns for the anchors of the page
        (a list of URL strings).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails
            or does not complete within the timeout.
    """
    # Send the module-level browser-like headers (they were defined but never
    # passed before), and bound the wait so a stalled server cannot hang the
    # script indefinitely: requests applies no timeout by default.
    pages = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(pages.text, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links, url)
|
||||||
|
|
||||||
|
|
||||||
|
def filterlinks(links, url_org):
    """Return the unique on-site URLs referenced by ``links``.

    Filtering rules, applied to each anchor's ``href``:

    * anchors without an ``href`` attribute are ignored;
    * absolute references (anything carrying a scheme or a host, e.g.
      ``http://...``, ``mailto:...``, ``//cdn.example/...``) are kept only
      when they start with the module-level ``urlmine`` prefix;
    * pure fragments (``#...``), pure query strings (``?...``) and the bare
      root ``/`` are ignored;
    * remaining relative references are resolved against ``url_org``.

    Args:
        links: Iterable of anchor elements supporting ``.get('href')``
            (BeautifulSoup ``Tag`` objects in this script).
        url_org: URL of the page the anchors were found on; used as the
            base for resolving relative references.

    Returns:
        A list of absolute URL strings without duplicates, in the order in
        which they were first encountered.
    """
    # Function-scope import keeps the block self-contained.
    from urllib.parse import urljoin, urlparse

    reslinks = []
    seen = set()
    for link in links:
        href = link.get('href')
        if href is None:
            # e.g. <a name="top">; indexing with ['href'] raised KeyError here.
            continue
        url = str(href)

        try:
            parsed = urlparse(url)
        except ValueError:
            # Malformed reference (such as a broken IPv6 host); nothing
            # sensible to crawl.
            continue

        # A scheme or host means the reference does not depend on url_org.
        # Checking the parsed parts rather than startswith('http') also
        # catches mailto:/javascript: links, which used to be glued onto
        # the base URL.
        is_absolute = bool(parsed.scheme or parsed.netloc)
        if is_absolute and not url.startswith(urlmine):
            continue
        if url.startswith(('#', '?')) or url.strip() == '/':
            continue

        if not is_absolute:
            # urljoin resolves "/path", "path" and "../path" correctly;
            # plain concatenation produced "//path" or nested paths.
            url = urljoin(url_org, url)

        if url not in seen:
            seen.add(url)
            reslinks.append(url)
    return reslinks
|
||||||
|
|
||||||
|
# Crawl the configured start page once, then emit every link that survived
# filtering, one per line, on standard output.
links = getlinks(urlmine)
for link in links:
    print(link)
|
1
requirement.txt
Normal file
1
requirement.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
BeautifulSoup4
|
Loading…
Reference in New Issue
Block a user