import requests
import re
import time
import random
book_url = 'https://xxx.com/book/123/'
single_chapter_url = ''
chapter_index_url_list = []
novel_index_path = 'novel_index.txt'
novel_path = 'novel.txt'
count = 0
timeCount = 0
# Spoof a desktop browser User-Agent so the site serves normal pages.
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/500.66 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/500.66' }
# Raw strings avoid invalid-escape warnings in the regex patterns.
single_chapter_pattern = r'<li num="(\d+)"><a href="(.*?)">(.*?)</a></li>'
title_regex = r'<h1 class="xxx">(.*?)</h1>'
content_regex = r'<br />\s*  (.*?)\s*<br />'
# Fetch the book's chapter index page and collect every chapter URL.
response_1 = requests.get(book_url, headers=headers)
response_1.encoding = 'utf-8'
matches = re.finditer(single_chapter_pattern, response_1.text)
results = [(match.group(1), match.group(2)) for match in matches]
if results:
    for cn, url in results:
        single_chapter_url = url
        chapter_index_url_list.append(single_chapter_url)
else:
    print("No match found.")
print("In total " + str(len(chapter_index_url_list)) + " chapters.")
print("Start crawling...")
with open(novel_path, 'a+', encoding="gbk") as file:
    for x in chapter_index_url_list:
        # Retry until the server stops returning a 503 error page.
        while True:
            response_2 = requests.get(x, headers=headers)
            if '503 Service Temporarily Unavailable' not in response_2.text:
                break
            else:
                print('Data lost, retrying in 3s')
                time.sleep(3)
        response_2.encoding = "utf-8"
        # Extract the chapter title and paragraphs, then append them to the file.
        title = re.findall(title_regex, response_2.text)
        contents = re.findall(content_regex, response_2.text)
        file.write(title[0] + '\n')
        for content in contents:
            file.write(content + '\n' + '\n')
        file.write('\n' + '\n')
        count += 1
        print('Chapter {}, Title: {}, Done!'.format(count, title[0]))
        # Sleep a random 12-17 seconds between requests to avoid being blocked.
        randomSleepTime = random.randint(12, 17)
        time.sleep(randomSleepTime)
        timeCount += randomSleepTime
print('Finished crawling! Total time: {} seconds'.format(timeCount))
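One weakness of the download loop above is that it retries a failing chapter forever and never sets a request timeout. Below is a minimal sketch of a hardened fetch step under the same assumptions (the `headers` dict and the 503 page check from the script); `fetch_chapter` and `max_retries` are hypothetical names, not part of the original script.

# Hypothetical helper sketch: cap retries and add a timeout instead of looping forever.
def fetch_chapter(url, max_retries=5):
    for _ in range(max_retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = 'utf-8'
            if resp.status_code == 200 and '503 Service Temporarily Unavailable' not in resp.text:
                return resp
        except requests.RequestException as exc:
            print('Request failed ({}), retrying...'.format(exc))
        time.sleep(3)
    return None  # caller should skip the chapter if every retry fails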