from collections import defaultdict
from typing import Literal, Tuple

from qdrant_client.models import ScoredPoint
from qdrant_client.conversions import common_types as types

from src.backend.database.qdrant import ScientificPapersMainSchema, ScientificPapersChunksSchema


class QdrantArticleResponse:
    """One retrieved article plus the paragraph-level hits used to highlight it.

    The document point's payload supplies the article id, the ordered section
    names and the sections themselves (each section is indexed and stripped
    per element below, i.e. it is used as a sequence of paragraph strings).
    Each highlight point's payload supplies a section name and a paragraph id,
    which is used as an integer index into that section.
    """

    # Number of neighbouring paragraphs, on each side of a hit, that are
    # treated as context (marked in the article / returned with the best hit).
    _CONTEXT_RADIUS = 1

    def __init__(self, query, document_point: ScoredPoint, highlight_points: list[ScoredPoint]):
        """Store the raw points and derive the per-section lookup tables.

        Args:
            query: The query that produced this response; stored as-is.
            document_point: Point whose payload describes the whole article.
            highlight_points: Points whose payloads identify matching paragraphs,
                in the order they should be considered (the first one is
                treated as the most relevant).
        """
        self.document_point = document_point
        self.highlight_points = highlight_points
        self.query = query

        self.paragraphs_per_section = {}
        self.highlighted_paragraphs = {}
        self._section_names_ordered = []
        self.article_id = ""
        self.abstract: list[str] = []
        self._relevant_paragraphs = []
        self._html_article = ""  # cache for the `html_article` property

        self._process_points()

    def _process_points(self):
        """Fill the instance tables from the document and highlight payloads."""
        payload = self.document_point.payload
        self.article_id = payload[ScientificPapersMainSchema.ARTICLE_ID]
        section_names = payload[ScientificPapersMainSchema.SECTION_NAMES]
        sections = payload[ScientificPapersMainSchema.SECTIONS]

        self._section_names_ordered = section_names
        paragraphs_per_section = dict(zip(section_names, sections))

        highlighted_paragraphs = defaultdict(list)
        for point in self.highlight_points:
            paragraph_id = point.payload[ScientificPapersChunksSchema.PARAGRAPH_ID]
            section_name = point.payload[ScientificPapersChunksSchema.SECTION_NAME]
            highlighted_paragraphs[section_name].append(paragraph_id)
            # Keeps the incoming order of the hits; index 0 is the top hit.
            self._relevant_paragraphs.append((section_name, paragraph_id))

        self.paragraphs_per_section = paragraphs_per_section
        self.highlighted_paragraphs = highlighted_paragraphs

    @classmethod
    def _context_indices(cls, paragraph_id: int, section_length: int) -> list[int]:
        """Return the in-range paragraph indices around ``paragraph_id``.

        The window is ``paragraph_id - radius .. paragraph_id + radius``
        clipped to ``0 .. section_length - 1``; it is empty when the hit lies
        entirely outside the section.
        """
        first = max(paragraph_id - cls._CONTEXT_RADIUS, 0)
        last = min(paragraph_id + cls._CONTEXT_RADIUS, section_length - 1)
        return list(range(first, last + 1))

    @staticmethod
    def _article_header_html(text, h_tag: Literal["h1", "h2", "h3"] = "h2"):
        """Render ``text`` as a heading element of the given level."""
        # The heading element is closed explicitly so that following content
        # is not swallowed into the heading.
        return f"\n<{h_tag}> {text} </{h_tag}>\n"

    @staticmethod
    def _mark_html(text):
        """Wrap ``text`` in a ``<mark class="highlight-paragraph">`` element."""
        mark_tag = 'mark class="highlight-paragraph"'
        # Without the closing tag the highlight would extend to the end of
        # the rendered document.
        return f"<{mark_tag}>{text}</mark>"

    @staticmethod
    def section_html(paragraphs: list[str]):
        """Render a list of paragraph strings as the HTML body of one section.

        NOTE(review): paragraph text is interpolated without HTML escaping, and
        the ``<p>`` wrapper is a reconstruction -- confirm the tag (and any
        class attribute) against the stylesheet that renders this output.
        """
        paragraph_html = '\n'.join(f'<p>{text}</p>' for text in paragraphs)
        return f"\n{paragraph_html}\n"

    @property
    def html_article(self):
        """Return the whole article as HTML with hit paragraphs highlighted.

        Every highlighted paragraph and its immediate neighbours are wrapped in
        a ``<mark>`` element. The result is cached after the first non-empty
        render.
        """
        if self._html_article:
            return self._html_article

        # Work on stripped copies so the stored paragraphs stay untouched.
        article = {
            name: [paragraph.strip() for paragraph in paragraphs]
            for name, paragraphs in self.paragraphs_per_section.items()
        }

        for section, paragraph_ids in self.highlighted_paragraphs.items():
            paragraphs = article.get(section)
            if paragraphs is None:
                # The hit names a section that the document payload does not
                # contain; there is nothing to mark for it.
                continue
            # Collect indices in a set first: windows of adjacent hits overlap,
            # and each paragraph must be wrapped at most once.
            to_mark = set()
            for paragraph_id in paragraph_ids:
                to_mark.update(self._context_indices(paragraph_id, len(paragraphs)))
            for index in to_mark:
                paragraphs[index] = self._mark_html(paragraphs[index])

        article_html = '\n'.join(
            f'{self._article_header_html(name, "h2")}\n{self.section_html(article[name])}'
            for name in self._section_names_ordered
        )
        article_html = f"\n{article_html}\n"

        self._html_article = article_html
        return article_html

    @property
    def html_most_relevant_paragraph(self):
        """Return the top hit with its neighbouring paragraphs, newline-joined.

        Returns an empty string when there are no highlight points.
        """
        if not self._relevant_paragraphs:
            return ''

        section, paragraph_id = self._relevant_paragraphs[0]
        paragraphs = self.paragraphs_per_section[section]
        window = self._context_indices(paragraph_id, len(paragraphs))
        return '\n'.join(paragraphs[index] for index in window)

    @property
    def article_link(self):
        """Return an HTML anchor pointing at the article on PMC."""
        link = "https://pmc.ncbi.nlm.nih.gov/articles/{}/".format(self.article_id)
        # NOTE(review): anchor attributes beyond ``href`` (target, class, ...)
        # are not known here; add them if the UI relies on them.
        link_html = f'<a href="{link}">View full article on external site: {self.article_id}</a>'
        return link_html


class QdrantQueryResponses:
    """Indexable collection of ``QdrantArticleResponse`` objects for one query."""

    def __init__(self, query: str, query_response: Tuple[types.QueryResponse, dict[str, types.QueryResponse]]):
        """Build one article response per document point.

        Args:
            query: The query text, passed through to every article response.
            query_response: Pair of (document-level response, mapping from
                article id to that article's highlight response).
        """
        self.query_responses: list[QdrantArticleResponse] = self._process_response(query, query_response)

    @staticmethod
    def _process_response(query, query_responses: Tuple[types.QueryResponse, dict[str, types.QueryResponse]]):
        """Pair every document point with its highlight points.

        A document without an entry in the highlight mapping is kept and gets
        an empty highlight list.
        """
        document_responses, highlight_dict = query_responses

        article_responses = []
        for document in document_responses.points:
            article_id = document.payload[ScientificPapersMainSchema.ARTICLE_ID]
            highlight_response = highlight_dict.get(article_id)
            highlight_points = highlight_response.points if highlight_response is not None else []
            article_responses.append(QdrantArticleResponse(query, document, highlight_points))
        return article_responses

    def __getitem__(self, idx):
        """Return the article response(s) at ``idx`` (int or slice)."""
        return self.query_responses[idx]