File size: 3,457 Bytes
5e433de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
from typing import Dict, List, Optional, Tuple

import pandas as pd
from langchain_core.documents import Document

class Metadata:
    """Resolve table and picture references carried in chunk metadata.

    The reference database is a CSV file loaded once at construction time.
    Lookups read its ``source``, ``self_ref``, ``chunk_type`` and
    ``page_content`` columns.
    """

    # Only references that point at tables or pictures are kept.
    _REF_PREFIXES = ("#/tables/", "#/pictures/")
    # References inside one metadata string are separated by commas and/or
    # whitespace.
    _REF_SEPARATOR = re.compile(r"[,\s]+")
    # Metadata keys scanned for references; this order fixes the order of the
    # resulting reference list.
    _REF_KEYS = ("self_ref", "parent_ref", "child_ref")

    def __init__(self, ref_database_path: str):
        """Load the reference database CSV located at *ref_database_path*."""
        self.df = pd.read_csv(ref_database_path)

    @staticmethod
    def _split_refs(value) -> List[str]:
        """Return the table/picture references contained in *value*.

        *value* may be a separator-delimited string or a list/tuple of such
        strings. Anything else (``None``, numbers, ...) yields an empty list
        instead of raising.
        """
        split = Metadata._REF_SEPARATOR.split
        if isinstance(value, str):
            candidates = split(value)
        elif isinstance(value, (list, tuple)):
            candidates = [
                part
                for item in value
                if isinstance(item, str)
                for part in split(item)
            ]
        else:
            candidates = []
        # Splitting an empty string yields [""], which the prefix test drops.
        return [ref for ref in candidates if ref.startswith(Metadata._REF_PREFIXES)]

    def extract_ref_from_metadata(self, meta_data: dict) -> Optional[Dict[str, object]]:
        """Extract table and picture references from one metadata mapping.

        The ``self_ref``, ``parent_ref`` and ``child_ref`` entries are scanned
        and every reference starting with ``#/tables/`` or ``#/pictures/`` is
        collected.

        Args:
            meta_data: Metadata mapping of a single chunk.

        Returns:
            ``{"source": <source or "">, "self_ref": [<refs>]}`` where the
            references are de-duplicated in first-seen order, or ``None``
            when the metadata holds no table or picture reference.
        """
        found: List[str] = []
        for key in Metadata._REF_KEYS:
            found.extend(Metadata._split_refs(meta_data.get(key)))

        if not found:
            return None

        return {
            "source": meta_data.get("source", ""),
            # dict.fromkeys removes duplicates while keeping a deterministic
            # order, unlike a round trip through set().
            "self_ref": list(dict.fromkeys(found)),
        }

    def extract_all_ref_from_retrived_chunks(self, chunks: List["Document"]) -> dict:
        """Collect the extracted references of every chunk that has any.

        Args:
            chunks: Iterable of objects exposing a ``metadata`` mapping.

        Returns:
            Mapping ``"doc_<index>"`` to the result of
            ``extract_ref_from_metadata`` for that chunk, where ``<index>`` is
            the chunk's position in *chunks*. Chunks without table or picture
            references are omitted.
        """
        all_metadata = {}
        for idx, doc in enumerate(chunks):
            extracted_ref_data = self.extract_ref_from_metadata(doc.metadata)
            if extracted_ref_data:
                all_metadata[f"doc_{idx}"] = extracted_ref_data
        return all_metadata

    def get_data_from_ref(
        self, chunks: List["Document"]
    ) -> Tuple[Dict[str, object], Dict[str, object]]:
        """Look up the content of tables and pictures referenced by *chunks*.

        For each reference, the first database row whose ``source`` and
        ``self_ref`` both match is used. Its ``page_content`` is filed under
        tables or images according to its ``chunk_type`` (``"table"`` or
        ``"picture"``); other chunk types are ignored.

        Args:
            chunks: Iterable of objects exposing a ``metadata`` mapping.

        Returns:
            A ``(tables, images)`` pair of dicts mapping reference id to page
            content.

        NOTE: results are keyed by reference id only, so the same id coming
        from two different sources overwrites the earlier entry.
        """
        tables: Dict[str, object] = {}
        images: Dict[str, object] = {}

        all_metadata = self.extract_all_ref_from_retrived_chunks(chunks)

        for meta in all_metadata.values():
            # Narrow to this chunk's source once instead of once per reference.
            source_rows = self.df[self.df["source"] == meta.get("source", "")]
            if source_rows.empty:
                continue

            for ref in meta.get("self_ref", []):
                reference_rows = source_rows[source_rows["self_ref"] == ref]
                if reference_rows.empty:
                    continue

                chunk_type = reference_rows["chunk_type"].iloc[0]
                page_content = reference_rows["page_content"].iloc[0]

                if chunk_type == "table":
                    tables[ref] = page_content
                elif chunk_type == "picture":
                    images[ref] = page_content

        return tables, images