通过爬取网站,生成sitemap.xml
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

44 lines
1.0 KiB

#!/usr/bin/env python
# coding=utf-8
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Root URL of the site being crawled: the crawl starts here, and absolute
# links that do not begin with this prefix are treated as external.
urlmine = "https://git.zeekling.cn/"

# HTTP request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}
def getlinks(url, timeout=10):
    """Download ``url`` and return the crawlable links found on that page.

    The page is fetched with the module-level ``headers``, parsed with
    BeautifulSoup's ``html.parser``, and every ``<a>`` element is handed to
    ``filterlinks`` together with ``url``.

    Args:
        url: Address of the page to download; also passed to ``filterlinks``
            as the page the anchors came from.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        Whatever ``filterlinks`` returns for the anchors of the page.
    """
    # Send the configured User-Agent and bound the wait: without an explicit
    # timeout, requests will block indefinitely on an unresponsive server.
    pages = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(pages.text, 'html.parser')
    return filterlinks(soup.find_all('a'), url)
def filterlinks(links, url_org):
    """Turn the anchors of one page into a de-duplicated list of site URLs.

    Args:
        links: Iterable of anchor elements that support ``link.get('href')``,
            such as the result of ``soup.find_all('a')``.
        url_org: URL of the page the anchors came from; relative hrefs are
            resolved against it.

    Returns:
        A list of absolute URLs, without duplicates, in first-seen order.
        The following are dropped: anchors with no (or an empty) href,
        absolute links that do not start with the module-level ``urlmine``,
        fragment-only (``#...``) and query-only (``?...``) links, the bare
        ``/`` link, and links that do not resolve to an http(s) URL
        (for example ``mailto:`` or ``javascript:``).
    """
    reslinks = []
    seen = set()
    for link in links:
        # .get() instead of ['href']: anchors without an href (named anchors,
        # script-driven links) must not abort the whole page with a KeyError.
        raw = link.get('href')
        url = str(raw).strip() if raw else ''
        if not url or url == '/' or url.startswith(('#', '?')):
            continue
        if url.startswith('http'):
            # Absolute link: keep it only when it points into our own site.
            if not url.startswith(urlmine):
                continue
        else:
            # Resolve relative links properly; plain concatenation produced
            # "//" when both sides carried a slash and ignored "../" segments.
            url = urljoin(url_org, url)
            if not url.startswith('http'):
                # urljoin leaves foreign schemes (mailto:, tel:, ...) as-is.
                continue
        if url not in seen:
            seen.add(url)
            reslinks.append(url)
    return reslinks
# Script body: collect the links found on the site's start page (urlmine)
# and print them one per line. This runs at module level, so it also
# executes whenever the file is imported.
links = getlinks(urlmine)
for link in links:
    print(link)