Generate sitemap.xml by crawling the website

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Root of the site to crawl and the seed pages to start from.
url_root = 'https://git.zeekling.cn'
url_mine_list = [
    'https://git.zeekling.cn/',
    'https://git.zeekling.cn/zeekling',
    'https://git.zeekling.cn/deep-learn',
    'https://git.zeekling.cn/java',
    'https://git.zeekling.cn/python',
    'https://git.zeekling.cn/linux',
    'https://git.zeekling.cn/mirror'
]
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# Collected result URLs and the maximum crawl depth.
url_res_final = []
max_depth = 2
# Paths that should not be crawled (sign-up, login, etc.).
url_robot_arr = [
    '/user/sign_up',
    '/user/login',
    '/user/forgot_password'
]
# Suffixes and pseudo-URLs treated as static resources and skipped.
url_static_arr = [
    '.js',
    '.css',
    '.cscc',
    'None',
    'about:blank'
]


def is_static(url):
    """Return True if the URL points to a static resource."""
    url = str(url)
    for static in url_static_arr:
        if url.endswith(static):
            return True
    return False


def is_robot(url):
    """Return True if the URL is a path that robots should not visit."""
    url = str(url)
    for robot in url_robot_arr:
        if url.startswith(robot):
            return True
    return False


def getlinks(url):
    """Fetch a page and return the filtered links found on it."""
    pages = requests.get(url, headers=headers)
    html = pages.text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    return filterlinks(links)


def filterlinks(links):
    """Keep only crawlable in-site links and normalize them to absolute URLs."""
    tmplinks = []
    for link in links:
        url = link.get('href')
        if url is None or url == '':
            continue
        url = str(url)
        ishttp = url.startswith('http')
        ismine = url.startswith(url_root)
        # Skip external links.
        if ishttp and (not ismine):
            continue
        # Skip in-page anchors, the bare root path, and query-only links.
        if url.startswith('#') or '/' == url.strip():
            continue
        if url.startswith('?'):
            continue
        if is_static(url):
            continue
        if is_robot(url):
            continue
        # Turn relative links into absolute URLs.
        if not ishttp and url.startswith('/'):
            url = url_root + url
        elif not ishttp:
            url = url_root + '/' + url
        tmplinks.append(url)
    reslinks = list(set(tmplinks))
    return reslinks


def parser(url_arr, depth):
    """Recursively crawl url_arr, collecting new URLs into url_res_final."""
    url_tmp = []
    if depth >= max_depth:
        return
    depth += 1
    print('parsing depth:', depth, ' parse urls:', len(url_arr))
    for urlmine in url_arr:
        links = getlinks(urlmine)
        url_tmp.extend(links)
    # Keep only URLs not seen before, then recurse one level deeper.
    url_tmp = list(set(url_tmp).difference(set(url_res_final)))
    url_res_final.extend(url_tmp)
    parser(url_tmp, depth)


parser(url_mine_list, 0)
# print('parser result size:', len(url_res_final))
# for url in url_res_final:
#     print(url)
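
The script above only collects the crawled URLs into url_res_final; it does not show the step that writes them out as sitemap.xml. The snippet below is a minimal sketch of that step using the standard library. The write_sitemap helper and the output filename are illustrative assumptions, not part of the original script.

# Minimal sketch (assumption): write the collected URLs to sitemap.xml.
import xml.etree.ElementTree as ET


def write_sitemap(urls, filename='sitemap.xml'):
    # Build the <urlset> root with the standard sitemap namespace,
    # then add one <url><loc>...</loc></url> entry per collected URL.
    urlset = ET.Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
    for url in sorted(urls):
        url_node = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url_node, 'loc')
        loc.text = url
    ET.ElementTree(urlset).write(filename, encoding='utf-8', xml_declaration=True)


# Example usage after parser() has run:
# write_sitemap(url_res_final)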