Quintino Fernandes commited on
Commit
e8db2ab
·
1 Parent(s): 21f3018

Different query(Original SQL one)

Browse files
database/query.py CHANGED
@@ -1,27 +1,17 @@
1
  import os
2
  from typing import List, Dict, Optional
3
- import vecs
4
  from datetime import datetime
 
 
5
 
6
  class DatabaseService:
7
  def __init__(self):
8
  # Connection parameters
9
- self.DB_HOST = os.getenv("SUPABASE_HOST", "db.daxquaudqidyeirypexa.supabase.co")
10
- self.DB_PORT = os.getenv("DB_PORT", "5432")
11
  self.DB_NAME = os.getenv("DB_NAME", "postgres")
12
- self.DB_USER = os.getenv("DB_USER", "postgres")
13
- self.DB_PASSWORD = os.getenv("DB_PASSWORD", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImRheHF1YXVkcWlkeWVpcnlwZXhhIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQzOTIzNzcsImV4cCI6MjA1OTk2ODM3N30.3qB-GfiCoqXEpbNfqV3iHiqOLr8Ex9nPVr6p9De5Hdc")
14
-
15
- # Create vecs client
16
- self.vx = vecs.create_client(
17
- f"postgresql://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
18
- )
19
-
20
- # Get or create the collection
21
- self.articles = self.vx.get_or_create_collection(
22
- name="articles",
23
- dimension=384 # Match your embedding model's output dimension
24
- )
25
 
26
  async def semantic_search(
27
  self,
@@ -29,62 +19,143 @@ class DatabaseService:
29
  start_date: Optional[datetime] = None,
30
  end_date: Optional[datetime] = None,
31
  topic: Optional[str] = None,
32
- entities: Optional[List[str]] = None, # Add entities parameter
33
  limit: int = 10
34
  ) -> List[Dict[str, any]]:
35
  try:
36
- # Base vector search
37
- filters = self._build_filters(start_date, end_date, topic)
38
-
39
- # Add entity filter if entities are provided
40
- if entities:
41
- filters["entities"] = {"$in": entities}
42
-
43
- results = self.articles.query(
44
- data=query_embedding,
45
- limit=limit,
46
- filters=filters,
47
- measure="cosine_distance" # or "inner_product", "l2_distance"
48
- )
49
-
50
- # Format results with metadata
51
- formatted_results = []
52
- for article_id, distance in results:
53
- metadata = self.articles.fetch(ids=[article_id])[0]["metadata"]
54
- formatted_results.append({
55
- "id": article_id,
56
- "url": metadata.get("url"),
57
- "content": metadata.get("content"),
58
- "date": metadata.get("date"),
59
- "topic": metadata.get("topic"),
60
- "distance": float(distance),
61
- "similarity": 1 - float(distance) # Convert to similarity score
62
- })
63
-
64
- return formatted_results
65
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  except Exception as e:
67
- print(f"Vector search error: {e}")
68
  return []
69
 
70
- def _build_filters(
71
- self,
72
- start_date: Optional[datetime],
73
- end_date: Optional[datetime],
74
- topic: Optional[str]
75
- ) -> Dict[str, any]:
76
- filters = {}
77
-
78
- if start_date and end_date:
79
- filters["date"] = {
80
- "$gte": start_date.isoformat(),
81
- "$lte": end_date.isoformat()
82
- }
83
-
84
- if topic:
85
- filters["topic"] = {"$eq": topic}
86
-
87
- return filters
88
-
89
  async def close(self):
90
- self.vx.disconnect()
 
 
1
  import os
2
  from typing import List, Dict, Optional
 
3
  from datetime import datetime
4
+ import psycopg2
5
+ from psycopg2 import sql
6
 
7
  class DatabaseService:
8
  def __init__(self):
9
  # Connection parameters
10
+ self.DB_HOST = os.getenv("SUPABASE_HOST", "aws-0-eu-west-3.pooler.supabase.com")
11
+ self.DB_PORT = os.getenv("DB_PORT", "6543")
12
  self.DB_NAME = os.getenv("DB_NAME", "postgres")
13
+ self.DB_USER = os.getenv("DB_USER")
14
+ self.DB_PASSWORD = os.getenv("DB_PASSWORD")
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  async def semantic_search(
17
  self,
 
19
  start_date: Optional[datetime] = None,
20
  end_date: Optional[datetime] = None,
21
  topic: Optional[str] = None,
22
+ entities: Optional[List[str]] = None,
23
  limit: int = 10
24
  ) -> List[Dict[str, any]]:
25
  try:
26
+ with psycopg2.connect(
27
+ user=self.DB_USER,
28
+ password=self.DB_PASSWORD,
29
+ host=self.DB_HOST,
30
+ port=self.DB_PORT,
31
+ dbname=self.DB_NAME
32
+ ) as conn:
33
+ with conn.cursor() as cursor:
34
+ # Enable unaccent extension if not already enabled
35
+ cursor.execute("CREATE EXTENSION IF NOT EXISTS unaccent;")
36
+
37
+ # Base query with date range and topic filters
38
+ base_query = sql.SQL('''
39
+ WITH filtered_articles AS (
40
+ SELECT article_id
41
+ FROM articles.articles
42
+ WHERE 1=1
43
+ ''')
44
+
45
+ # Add date range filter (if both start and end dates provided)
46
+ if start_date and end_date:
47
+ base_query = sql.SQL('{}{}').format(
48
+ base_query,
49
+ sql.SQL(' AND date BETWEEN {} AND {}').format(
50
+ sql.Literal(start_date),
51
+ sql.Literal(end_date)
52
+ )
53
+ )
54
+
55
+ # Add topic filter (if provided)
56
+ if topic:
57
+ base_query = sql.SQL('{}{}').format(
58
+ base_query,
59
+ sql.SQL(' AND topic = {}').format(sql.Literal(topic))
60
+ )
61
+
62
+ base_query = sql.SQL('{} {}').format(
63
+ base_query,
64
+ sql.SQL(')')
65
+ )
66
+
67
+ # Add entity filter (if entities exist)
68
+ if entities:
69
+ entity_conditions = sql.SQL(" OR ").join(
70
+ sql.SQL("""
71
+ (LOWER(UNACCENT(word)) = LOWER(UNACCENT({}))
72
+ AND entity_group = {})
73
+ """).format(
74
+ sql.Literal(e[0]), # Lowercase + unaccented entity text
75
+ sql.Literal(e[1]) # Original entity label (case-sensitive)
76
+ ) for e in entities
77
+ )
78
+
79
+ final_query = sql.SQL('''
80
+ {base_query},
81
+ target_articles AS (
82
+ SELECT DISTINCT article_id
83
+ FROM articles.ner
84
+ WHERE ({entity_conditions})
85
+ AND article_id IN (SELECT article_id FROM filtered_articles)
86
+ )
87
+ SELECT
88
+ a.content,
89
+ a.embedding <=> {embedding}::vector AS distance,
90
+ a.date,
91
+ a.topic
92
+ FROM articles.articles a
93
+ JOIN target_articles t ON a.article_id = t.article_id
94
+ ORDER BY distance
95
+ LIMIT {limit}
96
+ ''').format(
97
+ base_query=base_query,
98
+ entity_conditions=entity_conditions,
99
+ embedding=sql.Literal(query_embedding),
100
+ limit=sql.Literal(limit)
101
+ )
102
+ else:
103
+ final_query = sql.SQL('''
104
+ {base_query}
105
+ SELECT
106
+ a.content,
107
+ a.embedding <=> {embedding}::vector AS distance,
108
+ a.date,
109
+ a.topic
110
+ FROM articles.articles a
111
+ JOIN filtered_articles f ON a.article_id = f.article_id
112
+ ORDER BY distance
113
+ LIMIT {limit}
114
+ ''').format(
115
+ base_query=base_query,
116
+ embedding=sql.Literal(query_embedding),
117
+ limit=sql.Literal(limit)
118
+ )
119
+
120
+ cursor.execute(final_query)
121
+ articles = cursor.fetchall()
122
+
123
+ # Fallback: Retry with fewer filters if no results
124
+ if not articles:
125
+ print("No articles found with all filters. Relaxing filters...")
126
+ fallback_query = sql.SQL('''
127
+ SELECT
128
+ content,
129
+ embedding <=> {}::vector AS distance,
130
+ date,
131
+ topic
132
+ FROM articles.articles
133
+ ORDER BY distance
134
+ LIMIT {limit}
135
+ ''').format(
136
+ sql.Literal(query_embedding),
137
+ limit=sql.Literal(limit)
138
+ )
139
+ cursor.execute(fallback_query)
140
+ articles = cursor.fetchall()
141
+
142
+ # Format results
143
+ formatted_results = [
144
+ {
145
+ "content": content,
146
+ "distance": distance,
147
+ "date": art_date,
148
+ "topic": art_topic
149
+ }
150
+ for content, distance, art_date, art_topic in articles
151
+ ]
152
+
153
+ return formatted_results
154
+
155
  except Exception as e:
156
+ print(f"Database query error: {e}")
157
  return []
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  async def close(self):
160
+ # No persistent connection to close in psycopg2
161
+ pass
database/query_processor.py CHANGED
@@ -63,6 +63,5 @@ class QueryProcessor:
63
 
64
  return {
65
  "summary": summary,
66
- "key_sentences": key_sentences,
67
  "articles": articles
68
  }
 
63
 
64
  return {
65
  "summary": summary,
 
66
  "articles": articles
67
  }
requirements.txt CHANGED
@@ -10,5 +10,4 @@ numpy
10
  pandas
11
  scipy
12
  psycopg2
13
- vecs
14
  sentencepiece
 
10
  pandas
11
  scipy
12
  psycopg2
 
13
  sentencepiece