File size: 1,372 Bytes
21e639d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
from .base import Parser
from datetime import datetime
from .....configs import COOKIES
from logging import LoggerAdapter
from typing import Callable, List
from dataclasses import dataclass
from scrapy import Request, Selector
from scrapy.http.response.html import HtmlResponse


@dataclass
class YearBackwardIndexParser(Parser):
    """
    Parse index pages backwards in time, from the current page towards a cutoff.

    The post links in ``title_tags`` are visited in reverse document order. A
    request is yielded for every post whose URL carries a 10-digit timestamp at
    or after ``since``. The first post older than ``since`` ends the crawl; if
    no such post is found, the "previous page" link of the response is followed.
    """

    # Cutoff the URL timestamps are compared against; converted with int().
    since: str
    # NOTE(review): annotated as List[Selector], but used through .items(),
    # .text() and .attr(), which looks like a PyQuery selection - confirm.
    title_tags: List[Selector]
    # Receives one info line per scheduled post and one per index page.
    logger: LoggerAdapter

    def parse(
        self, response: HtmlResponse, callback: Callable, self_callback: Callable
    ):
        """
        Yield requests for the posts on this index page and for the previous page.

        Args:
            response: The index page response; its ``dom`` attribute is queried
                for the previous-page button.
            callback: Callback attached to every post request.
            self_callback: Callback attached to the previous-page request.

        Yields:
            One ``Request`` per post that is not older than ``since``, then at
            most one ``Request`` for the previous index page.
        """
        # Loop invariants: convert the cutoff and compile the pattern once.
        since = int(self.since)
        timestamp_pattern = re.compile(r"(\d{10})")

        for title_tag in reversed(list(self.title_tags.items())):
            title = title_tag.text()
            post_url = title_tag.attr("href")

            # A missing href or a URL without a 10-digit run used to raise
            # (TypeError / AttributeError) and abort the whole page; skip the
            # entry instead so the remaining posts are still processed.
            match = timestamp_pattern.search(post_url) if post_url else None
            if match is None:
                self.logger.warning(
                    f"skipped, no timestamp in url: {title}, {post_url}"
                )
                continue

            timestamp = int(match.group(1))

            # First post older than the cutoff: stop without requesting the
            # previous index page.
            if timestamp < since:
                return

            self.logger.info(
                f"+ {title}, {post_url}, {datetime.fromtimestamp(timestamp)}"
            )

            yield Request(post_url, cookies=COOKIES, callback=callback)

        # "上頁" is the label of the previous-page button.
        prev_url = response.dom('.btn.wide:contains("上頁")').attr("href")
        self.logger.info(f"index link: {prev_url}")

        if prev_url:
            yield Request(prev_url, cookies=COOKIES, callback=self_callback)