#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
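
# Simple recursive crawler for git.zeekling.cn: starting from the seed pages
# below, it follows on-site links up to max_depth levels and collects every
# unique URL it finds in url_res_final.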
url_root = 'https://git.zeekling.cn'
url_mine_list = [
    'https://git.zeekling.cn/',
    'https://git.zeekling.cn/zeekling',
    'https://git.zeekling.cn/deep-learn',
    'https://git.zeekling.cn/java',
    'https://git.zeekling.cn/python',
    'https://git.zeekling.cn/linux',
    'https://git.zeekling.cn/mirror'
]
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# All unique URLs discovered so far.
url_res_final = []
# Maximum crawl depth.
max_depth = 2

# Paths that a crawler should not visit (account pages).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]

# Link targets to skip: static resources and hrefs that are not real pages.
url_static_arr = [
    '.js',
    '.css',
    '.cscc',
    'None',
    'about:blank'
]


def is_static(url):
    """Return True if the URL points at a static resource or a non-link."""
    url = str(url)
    for static in url_static_arr:
        if url.endswith(static):
            return True
    return False


def is_robot(url):
    """Return True if the URL starts with a path that should not be crawled."""
    url = str(url)
    for robot in url_robot_arr:
        if url.startswith(robot):
            return True
    return False


def getlinks(url):
    """Fetch a page and return the filtered links it contains."""
    # Send the browser User-Agent defined above with every request.
    pages = requests.get(url, headers=headers)
    html = pages.text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links)


def filterlinks(links):
    """Keep only crawlable on-site links, normalised to absolute URLs."""
    tmplinks = []
    for link in links:
        href = link.get('href')
        # Skip anchors that have no usable href.
        if href is None or href == '':
            continue
        url = str(href)
        ishttp = url.startswith('http')
        ismine = url.startswith(url_root)
        # Skip absolute links that point off-site.
        if ishttp and (not ismine):
            continue
        # Skip in-page anchors and the bare root path.
        if url.startswith('#') or '/' == url.strip():
            continue
        # Skip query-only links.
        if url.startswith('?'):
            continue
        if is_static(url):
            continue
        if is_robot(url):
            continue
        # Turn relative paths into absolute URLs under url_root.
        if not ishttp and url.startswith('/'):
            url = url_root + url
        elif not ishttp:
            url = url_root + '/' + url
        tmplinks.append(url)
    # Deduplicate before returning.
    reslinks = list(set(tmplinks))
    return reslinks


def parser(url_arr, depth):
    """Recursively crawl url_arr, going at most max_depth levels deep."""
    url_tmp = []
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    for urlmine in url_arr:
        links = getlinks(urlmine)
        url_tmp.extend(links)
    # Only keep (and recurse into) links that have not been seen before.
    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)
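

# Crawl the site starting from the seed pages at depth 0.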
parser(url_mine_list, 0)

# print('parser result size:', len(url_res_final))
# for url in url_res_final:
#     print(url)
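# To inspect the crawl result, uncomment the print lines above and run this
# script with python3.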