Spaces:

jeffeux
/

assignment-1-jeffeuxMartin

Runtime error

File size: 1,179 Bytes

21e639d

import re
from .base import Parser
from scrapy import Request
from typing import Callable
from .....configs import COOKIES
from logging import LoggerAdapter
from dataclasses import dataclass
from scrapy.http.response.html import HtmlResponse


@dataclass
class LatestIndexParser(Parser):
    """
    The LatestIndexParser objects parses the latest index.html file.
    """

    logger: LoggerAdapter

    def get_latest_index(self, prev_url: str):
        return int(re.search(r"index(\d{1,6})\.html", prev_url).group(1))

    def get_board(self, url: str):
        return re.search(r"www\.ptt\.cc\/bbs\/([\w\d\-_]{1,30})\/", url).group(1)

    def parse(self, response: HtmlResponse, callback: Callable):
        prev_url = response.css('.btn.wide:contains("上頁")::attr(href)').get()
        self.logger.info(f"index link: {prev_url}")

        latest_index = self.get_latest_index(prev_url)
        board = self.get_board(response.url)

        for index in range(1, latest_index + 1):
            url = f"https://www.ptt.cc/bbs/{board}/index{index}.html"
            self.logger.info(f"index link: {url}")

            yield Request(url, cookies=COOKIES, callback=callback)