diff --git a/get_url.py b/get_url.py
index 55f0831..31a11a3 100755
--- a/get_url.py
+++ b/get_url.py
@@ -4,41 +4,97 @@
 import requests
 from bs4 import BeautifulSoup
 
-urlmine = "https://git.zeekling.cn/"
+
+url_root = 'https://git.zeekling.cn'
+url_mine_list = [
+    'https://git.zeekling.cn',
+    'https://git.zeekling.cn/zeekling'
+]
+
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 }
 
+url_res_final = []
+max_depth = 2
+url_robot_arr = [
+    '/user/sign_up',
+    '/user/login',
+    '/user/forgot_password'
+]
+url_static_arr = [
+    '.js',
+    '.css',
+    '.cscc'
+]
+
+
+def is_static(url):
+    url = str(url)
+    for static in url_static_arr:
+        if url.endswith(static):
+            return True
+    return False
+
+
+def is_robot(url):
+    url = str(url)
+    for robot in url_robot_arr:
+        if url.startswith(robot):
+            return True
+    return False
+
 
 def getlinks(url):
     pages = requests.get(url)
     html = pages.text
     soup = BeautifulSoup(html, 'html.parser')
     links = soup.find_all('a')
-    return filterlinks(links, url)
+    return filterlinks(links)
 
 
-def filterlinks(links, url_org):
+def filterlinks(links):
     tmplinks = []
     for link in links:
-        url = str(link['href'])
+        url = link.get('href')
+        if url is None or url == '':
+            continue
         ishttp = url.startswith('http')
-        ismine = url.startswith(urlmine)
+        ismine = url.startswith(url_root)
         if ishttp and (not ismine):
             continue
         if url.startswith('#') or '/' == url.strip():
             continue
         if url.startswith("?"):
             continue
+        if is_static(url):
+            continue
+        if is_robot(url):
+            continue
         if not ishttp:
-            url = url_org + url
+            url = url_root + url
         tmplinks.append(url)
     reslinks = list(set(tmplinks))
     return reslinks
 
 
-links = getlinks(urlmine)
-for link in links:
-    print(link)
+def parser(url_arr, depth):
+    url_tmp = []
+    if(depth >= max_depth):
+        return
+    depth += 1
+    print('parsing depth:', depth, ' parse urls:', len(url_arr))
+    for urlmine in url_arr:
+        links = getlinks(urlmine)
+        url_tmp.extend(links)
+    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
+    url_res_final.extend(url_tmp)
+    parser(url_tmp, depth)
+
+
+parser(url_mine_list, 0)
+print('parser result size:', len(url_res_final))
+for url in url_res_final:
+    print(url)