File size: 1,179 Bytes
21e639d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
from .base import Parser
from scrapy import Request
from typing import Callable
from .....configs import COOKIES
from logging import LoggerAdapter
from dataclasses import dataclass
from scrapy.http.response.html import HtmlResponse


@dataclass
class LatestIndexParser(Parser):
    """
    Parse a PTT board index page and schedule requests for its numbered pages.

    The parser reads the "上頁" (previous page) link of the response to learn
    the highest page number to crawl, then yields one request for every
    ``index<N>.html`` page from 1 up to that number.
    """

    # Logger used to report the discovered link and every scheduled URL.
    logger: LoggerAdapter

    def get_latest_index(self, prev_url: str) -> int:
        """
        Extract the page number from an ``index<N>.html`` link.

        Args:
            prev_url: URL or path that contains ``index<N>.html``.

        Returns:
            The number ``N`` as an integer.

        Raises:
            ValueError: If ``prev_url`` is empty/None or holds no index number.
        """
        # `or ""` keeps a missing href from surfacing as an opaque TypeError.
        match = re.search(r"index(\d{1,6})\.html", prev_url or "")
        if match is None:
            raise ValueError(f"no index number found in link: {prev_url!r}")
        return int(match.group(1))

    def get_board(self, url: str) -> str:
        """
        Extract the board name from a ``www.ptt.cc/bbs/<board>/`` URL.

        Args:
            url: Absolute URL of a page inside a board.

        Returns:
            The board name segment of the URL.

        Raises:
            ValueError: If ``url`` is empty/None or is not a board URL.
        """
        match = re.search(r"www\.ptt\.cc\/bbs\/([\w\d\-_]{1,30})\/", url or "")
        if match is None:
            raise ValueError(f"no board name found in url: {url!r}")
        return match.group(1)

    def parse(self, response: HtmlResponse, callback: Callable):
        """
        Yield one request per numbered index page of the response's board.

        Args:
            response: The fetched index page; its "上頁" link supplies the
                highest page number and its URL supplies the board name.
            callback: Callable attached to every yielded request.

        Yields:
            A ``Request`` for ``index1.html`` through ``index<N>.html``, where
            ``N`` is the number found in the "上頁" link. Nothing is yielded
            when that link has no href.

        Raises:
            ValueError: If the link or the response URL has an unexpected
                format (see ``get_latest_index`` and ``get_board``).
        """
        prev_url = response.css('.btn.wide:contains("上頁")::attr(href)').get()
        self.logger.info("index link: %s", prev_url)

        if prev_url is None:
            # No href on the "上頁" button means there is no earlier page to
            # derive a page count from (presumably a single-page board —
            # TODO confirm), so there is nothing to schedule.
            self.logger.warning(
                "no previous-page link found on %s; no index pages scheduled",
                response.url,
            )
            return

        latest_index = self.get_latest_index(prev_url)
        board = self.get_board(response.url)

        # NOTE(review): the number comes from the *previous page* link, so the
        # newest page itself is not requested here — confirm that the caller
        # handles it (or that the upper bound should be one higher).
        for index in range(1, latest_index + 1):
            url = f"https://www.ptt.cc/bbs/{board}/index{index}.html"
            self.logger.info("index link: %s", url)

            yield Request(url, cookies=COOKIES, callback=callback)