File size: 2,103 Bytes
21db53c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from datetime import datetime
from typing import Optional
from uuid import UUID

from numpy import ndarray
from pydantic import BaseModel, Field, ConfigDict


class ImageData(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True, extra='ignore')

    id: UUID
    url: Optional[str] = None
    thumbnail_url: Optional[str] = None
    ocr_text: Optional[str] = None
    image_vector: Optional[ndarray] = Field(None, exclude=True)
    text_contain_vector: Optional[ndarray] = Field(None, exclude=True)
    index_date: datetime
    width: Optional[int] = None
    height: Optional[int] = None
    aspect_ratio: Optional[float] = None
    starred: Optional[bool] = False
    categories: Optional[list[str]] = []
    local: Optional[bool] = False
    local_thumbnail: Optional[bool] = False
    format: Optional[str] = None  # required for s3 local storage

    @property
    def ocr_text_lower(self) -> str | None:
        if self.ocr_text is None:
            return None
        return self.ocr_text.lower()

    @property
    def payload(self):
        result = self.model_dump(exclude={'id', 'index_date'})
        # Qdrant database cannot accept datetime object, so we have to convert it to string
        result['index_date'] = self.index_date.isoformat()
        # Qdrant doesn't support case-insensitive search, so we need to store a lowercase version of the text
        result['ocr_text_lower'] = self.ocr_text_lower
        return result

    @classmethod
    def from_payload(cls, img_id: str, payload: dict,
                     image_vector: Optional[ndarray] = None, text_contain_vector: Optional[ndarray] = None):
        # Convert the datetime string back to datetime object
        index_date = datetime.fromisoformat(payload['index_date'])
        del payload['index_date']
        return cls(id=UUID(img_id),
                   index_date=index_date,
                   **payload,
                   image_vector=image_vector if image_vector is not None else None,
                   text_contain_vector=text_contain_vector if text_contain_vector is not None else None)