import requests
import re
import time
import random
book_url = 'https://xxx.com/book/123/'
single_chapter_url = ''
chapter_index_url_list = []
novel_index_path = 'novel_index.txt'
novel_path = 'novel.txt'
count = 0
timeCount = 0
# Spoof a desktop browser User-Agent so the site serves normal pages.
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/500.66 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/500.66' }
# Raw strings avoid invalid-escape warnings in the regex patterns.
single_chapter_pattern = r'<li num="(\d+)"><a href="(.*?)">(.*?)</a></li>'
title_regex = r'<h1 class="xxx">(.*?)</h1>'
content_regex = r'<br />\s*  (.*?)\s*<br />'
# Fetch the book's chapter index page and collect every chapter URL.
response_1 = requests.get(book_url, headers=headers)
response_1.encoding = 'utf-8'
matches = re.finditer(single_chapter_pattern, response_1.text)
results = [(match.group(1), match.group(2)) for match in matches]
if results:
    for cn, url in results:
        single_chapter_url = url
        chapter_index_url_list.append(single_chapter_url)
else:
    print("No match found.")
print("In total " + str(len(chapter_index_url_list)) + " chapters.")
print("Start crawling...")
with open(novel_path, 'a+', encoding="gbk") as file:
    for x in chapter_index_url_list:
        # Retry until the server stops returning a 503 error page.
        while True:
            response_2 = requests.get(x, headers=headers)
            if '503 Service Temporarily Unavailable' not in response_2.text:
                break
            else:
                print('Data lost, retrying in 3s')
                time.sleep(3)
        response_2.encoding = "utf-8"
        # Extract the chapter title and paragraphs, then append them to the file.
        title = re.findall(title_regex, response_2.text)
        contents = re.findall(content_regex, response_2.text)
        file.write(title[0] + '\n')
        for content in contents:
            file.write(content + '\n' + '\n')
        file.write('\n' + '\n')
        count += 1
        print('Chapter {}, Title: {}, Done!'.format(count, title[0]))
        # Sleep a random 12-17 seconds between requests to avoid being blocked.
        randomSleepTime = random.randint(12, 17)
        time.sleep(randomSleepTime)
        timeCount += randomSleepTime
print('Finished crawling! Total time: {} seconds'.format(timeCount))
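One weakness of the download loop above is that it retries a failing chapter forever and never sets a request timeout. Below is a minimal sketch of a hardened fetch step under the same assumptions (the `headers` dict and the 503 page check from the script); `fetch_chapter` and `max_retries` are hypothetical names, not part of the original script.

# Hypothetical helper sketch: cap retries and add a timeout instead of looping forever.
def fetch_chapter(url, max_retries=5):
    for _ in range(max_retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = 'utf-8'
            if resp.status_code == 200 and '503 Service Temporarily Unavailable' not in resp.text:
                return resp
        except requests.RequestException as exc:
            print('Request failed ({}), retrying...'.format(exc))
        time.sleep(3)
    return None  # caller should skip the chapter if every retry fails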