darpanaswal committed · verified
Commit fd6b733 · 1 Parent(s): a0a8763

Update cross_encoder_reranking_train.py

Files changed (1):
  cross_encoder_reranking_train.py (+131 -48)
cross_encoder_reranking_train.py CHANGED
@@ -13,8 +13,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Load embedder once
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-embedder = embedder.to(device)
+embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").to(device)
 
 
 def embed_text_list(texts):
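The swap from all-MiniLM-L6-v2 to the larger all-mpnet-base-v2 changes the dense embedder behind embed_text_list, whose body lies outside this diff. A minimal sketch of what such a helper presumably looks like, reconstructed only from how the commit calls it:

# Hypothetical reconstruction: embed_text_list is defined outside this diff.
# It must return one vector per input string, since the new helpers index into
# its result and pass it to sklearn's cosine_similarity.
def embed_text_list(texts):
    # convert_to_numpy=True yields an ndarray compatible with cosine_similarity
    return embedder.encode(texts, convert_to_numpy=True)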
@@ -62,6 +61,28 @@ def process_single_patent(patent_dict):
         "features": rank_by_centrality(top_features),
     }
 
+def refined_process_single_patent(patent_dict, top_n=10):
+    abstract = patent_dict.get("pa01", "")
+    title = patent_dict.get("title", "")
+    context = f"{title} {abstract}"
+    context_emb = embed_text_list([context])[0]
+
+    claims = [v for k, v in patent_dict.items() if k.startswith("c-en")]
+    paragraphs = [v for k, v in patent_dict.items() if k.startswith("p")]
+    features = [v for k, v in patent_dict.get("features", {}).items()]
+
+    def semantic_rank(items, context_emb):
+        embeddings = embed_text_list(items)
+        scores = cosine_similarity([context_emb], embeddings)[0]
+        ranked_items = [item for item, _ in sorted(zip(items, scores), key=lambda x: x[1], reverse=True)]
+        return ranked_items
+
+    return {
+        "claims": semantic_rank(claims, context_emb)[:top_n],
+        "paragraphs": semantic_rank(paragraphs, context_emb)[:top_n],
+        "features": semantic_rank(features, context_emb)[:top_n],
+    }
+
 def load_json_file(file_path):
     """Load JSON data from a file"""
     with open(file_path, 'r') as f:
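A quick usage sketch of the new helper on a toy patent dict. The field names here are hypothetical but follow the key prefixes the code filters on; note that the k.startswith("p") filter also matches the abstract key pa01, so the abstract gets ranked among the paragraphs as well:

# Toy input; keys mimic the "pa01" / "c-en…" / "p…" conventions the code expects.
toy_patent = {
    "title": "Heat exchanger",
    "pa01": "A heat exchanger with finned copper tubes.",
    "c-en-0001": "1. A heat exchanger comprising finned tubes.",
    "p0001": "Embodiments use copper fins for high conductivity.",
    "features": {"f1": "finned tubes", "f2": "copper fins"},
}
ranked = refined_process_single_patent(toy_patent, top_n=5)
print(ranked["claims"])  # claims sorted by similarity to title + abstract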
@@ -153,6 +174,22 @@ def extract_text(content_dict, text_type="full"):
 
         return " ".join(all_text)
 
+    elif text_type == "smart2":
+        filtered_dict = refined_process_single_patent(content_dict)
+        all_text = []
+        # Context with title and abstract
+        if "title" in content_dict:
+            all_text.append(content_dict["title"])
+        if "pa01" in content_dict:
+            all_text.append(content_dict["pa01"])
+
+        # Add claims, paragraphs, and features
+        all_text.extend(filtered_dict["claims"])
+        all_text.extend(filtered_dict["paragraphs"])
+        all_text.extend(filtered_dict["features"])
+
+        return " ".join(all_text)
+
 
     return ""
 
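Selecting the new branch is a one-liner. The existing "smart" path (not shown in this diff) appears to rank sections via the centrality-based process_single_patent; "smart2" ranks them by embedding similarity instead, then both concatenate title, abstract, and the top-ranked sections:

# text_type "smart2" routes through the semantic ranking added above
text = extract_text(content_dict, text_type="smart2")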
@@ -166,67 +203,113 @@ def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
     batch_size = last_hidden_states.shape[0]
     return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
+# def get_detailed_instruct(task_description: str, query: str) -> str:
+#     """Create an instruction-formatted query"""
+#     return f'Instruct: {task_description}\nQuery: {query}'
+
 def get_detailed_instruct(task_description: str, query: str) -> str:
-    """Create an instruction-formatted query"""
-    return f'Instruct: {task_description}\nQuery: {query}'
+    return (
+        f"Instruct: Evaluate the semantic and technical similarity between two patent documents."
+        f" Prioritize highly similar claims, technical implementations, and shared functionalities."
+        f"\nQuery: {query}"
+    )
 
-def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=8, max_length=2048):
-    """
-    Rerank document texts based on query text using cross-encoder model
-
-    Parameters:
-    query_text (str): The query text
-    doc_texts (list): List of document texts
-    model: The cross-encoder model
-    tokenizer: The tokenizer for the model
-    batch_size (int): Batch size for processing
-    max_length (int): Maximum sequence length
-
-    Returns:
-    list: Indices of documents sorted by relevance score (descending)
-    """
-    device = next(model.parameters()).device
-    scores = []
+def hybrid_score(cross_encoder_score, semantic_score, weight_cross=0.7, weight_semantic=0.3):
+    return (weight_cross * cross_encoder_score) + (weight_semantic * semantic_score)
+
+
+# def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=8, max_length=2048):
+#     """
+#     Rerank document texts based on query text using cross-encoder model
+#
+#     Parameters:
+#     query_text (str): The query text
+#     doc_texts (list): List of document texts
+#     model: The cross-encoder model
+#     tokenizer: The tokenizer for the model
+#     batch_size (int): Batch size for processing
+#     max_length (int): Maximum sequence length
 
-    # Format query with instruction
-    task_description = 'Re-rank a set of retrieved patents based on their relevance to a given query patent. The task aims to refine the order of patents by evaluating their semantic similarity to the query patent, ensuring that the most relevant patents appear at the top of the list.'
+#     Returns:
+#     list: Indices of documents sorted by relevance score (descending)
+#     """
+#     device = next(model.parameters()).device
+#     scores = []
+#
+#     # Format query with instruction
+#     task_description = 'Re-rank a set of retrieved patents based on their relevance to a given query patent. The task aims to refine the order of patents by evaluating their semantic similarity to the query patent, ensuring that the most relevant patents appear at the top of the list.'
 
-    instructed_query = get_detailed_instruct(task_description, query_text)
+#     instructed_query = get_detailed_instruct(task_description, query_text)
 
-    # Process in batches to avoid OOM
-    for i in tqdm(range(0, len(doc_texts), batch_size), desc="Scoring documents", leave=False):
-        batch_docs = doc_texts[i:i+batch_size]
+#     # Process in batches to avoid OOM
+#     for i in tqdm(range(0, len(doc_texts), batch_size), desc="Scoring documents", leave=False):
+#         batch_docs = doc_texts[i:i+batch_size]
 
-        # Prepare input pairs for the batch
-        input_texts = [instructed_query] + batch_docs
+#         # Prepare input pairs for the batch
+#         input_texts = [instructed_query] + batch_docs
 
-        # Tokenize
-        with torch.no_grad():
-            batch_dict = tokenizer(input_texts, max_length=max_length, padding=True,
-                                   truncation=True, return_tensors='pt').to(device)
+#         # Tokenize
+#         with torch.no_grad():
+#             batch_dict = tokenizer(input_texts, max_length=max_length, padding=True,
+#                                    truncation=True, return_tensors='pt').to(device)
 
-            # Get embeddings
-            outputs = model(**batch_dict)
-            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+#             # Get embeddings
+#             outputs = model(**batch_dict)
+#             embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
-            # Normalize embeddings
-            embeddings = F.normalize(embeddings, p=2, dim=1)
+#             # Normalize embeddings
+#             embeddings = F.normalize(embeddings, p=2, dim=1)
 
-            # Calculate similarity scores between query and documents
-            batch_scores = (embeddings[0].unsqueeze(0) @ embeddings[1:].T).squeeze(0) * 100
-            scores.extend(batch_scores.cpu().tolist())
+#             # Calculate similarity scores between query and documents
+#             batch_scores = (embeddings[0].unsqueeze(0) @ embeddings[1:].T).squeeze(0) * 100
+#             scores.extend(batch_scores.cpu().tolist())
 
-    # Create list of (index, score) tuples for sorting
-    indexed_scores = list(enumerate(scores))
+#     # Create list of (index, score) tuples for sorting
+#     indexed_scores = list(enumerate(scores))
 
-    # Sort by score in descending order
-    indexed_scores.sort(key=lambda x: x[1], reverse=True)
+#     # Sort by score in descending order
+#     indexed_scores.sort(key=lambda x: x[1], reverse=True)
 
-    # Return sorted indices
+#     # Return sorted indices
+#     return [idx for idx, _ in indexed_scores]
+
+def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=8, max_length=2048):
+    device = next(model.parameters()).device
+    cross_scores = []
+    query_emb = embed_text_list([query_text])[0]
+
+    instructed_query = get_detailed_instruct("", query_text)
+
+    for i in tqdm(range(0, len(doc_texts), batch_size), desc="Scoring documents", leave=False):
+        batch_docs = doc_texts[i:i+batch_size]
+
+        input_texts = [instructed_query] + batch_docs
+
+        with torch.no_grad():
+            batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt').to(device)
+
+            outputs = model(**batch_dict)
+            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+
+            batch_cross_scores = (embeddings[0].unsqueeze(0) @ embeddings[1:].T).squeeze(0).cpu().numpy()
+            cross_scores.extend(batch_cross_scores)
+
+    # Semantic scores
+    doc_embeddings = embed_text_list(doc_texts)
+    semantic_scores = cosine_similarity([query_emb], doc_embeddings)[0]
+
+    # Hybrid scores
+    hybrid_scores = [hybrid_score(c, s) for c, s in zip(cross_scores, semantic_scores)]
+
+    indexed_scores = list(enumerate(hybrid_scores))
+    indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
     return [idx for idx, _ in indexed_scores]
 
 def main():
     base_directory = os.getcwd()
+    base_directory += "/Patent_Retrieval"
     parser = argparse.ArgumentParser(description='Re-rank patents using cross-encoder scoring (training queries only)')
     parser.add_argument('--pre_ranking', type=str, default='shuffled_pre_ranking.json',
                         help='Path to pre-ranking JSON file')
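Worth noting: the rewritten reranker drops the old *100 scaling, so the cross-encoder score and the bi-encoder semantic score are both plain cosine similarities in [-1, 1], and the 0.7/0.3 weights in hybrid_score act on comparable ranges. A worked example of the blend:

# hybrid_score(c, s) = 0.7*c + 0.3*s with the default weights
c, s = 0.82, 0.64          # cross-encoder and semantic cosine similarities
print(hybrid_score(c, s))  # 0.7*0.82 + 0.3*0.64 = 0.766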
@@ -241,7 +324,7 @@ def main():
     parser.add_argument('--queries_list', type=str, default='test_queries.json',
                         help='Path to training queries JSON file')
     parser.add_argument('--text_type', type=str, default='TA',
-                        choices=['TA', 'claims', 'description', 'full', 'tac1', 'smart'],
+                        choices=['TA', 'claims', 'description', 'full', 'tac1', 'smart', 'smart2'],
                         help='Type of text to use for scoring')
     parser.add_argument('--model_name', type=str, default='intfloat/e5-large-v2',
                         help='Name of the cross-encoder model')
@@ -252,7 +335,7 @@ def main():
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                         help='Device to use (cuda/cpu)')
     parser.add_argument('--base_dir', type=str,
-                        default=f'{base_directory}/Patent_Retrieval/datasets',
+                        default=f'{base_directory}/datasets',
                         help='Base directory for data files')
 
     args = parser.parse_args()
 
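For orientation, a minimal sketch of how the new pieces fit together outside main(). The AutoModel/AutoTokenizer wiring here is an assumption; main() builds the model and tokenizer from --model_name, which defaults to intfloat/e5-large-v2:

# Hypothetical wiring; the script's main() does the equivalent from CLI args.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")
model = AutoModel.from_pretrained("intfloat/e5-large-v2").to(device)

ranking = cross_encoder_reranking(
    query_text="text of the query patent",
    doc_texts=["candidate patent A", "candidate patent B"],
    model=model,
    tokenizer=tokenizer,
)
print(ranking)  # candidate indices, best hybrid score first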