Spaces:

ksg-dfci
/

trial_search_alpha

Running on CPU Upgrade

File size: 8,703 Bytes

787eac9
 
 
 
c3ce722
787eac9
 
 
 
 
 
 
 
6627d0a
787eac9
 
b4d552d
787eac9
 
 
 
c3ce722
 
 
 
 
 
 
 
 
 
fd5440d
a8557d1
c3ce722
 
fd5440d
c3ce722
 
 
 
 
 
 
 
 
 
 
 
364bf23
787eac9
364bf23
 
5890666
364bf23
c3ce722
787eac9
c3ce722
364bf23
 
787eac9
 
 
 
 
 
 
364bf23
 
787eac9
364bf23
 
 
787eac9
364bf23
db84ee0
d1e19bb
364bf23
 
d1e19bb
 
364bf23
 
 
787eac9
 
364bf23
c3ce722
 
d1d3fa0
c3ce722
d1e19bb
3a245c8
364bf23
 
 
 
99999b9
364bf23
3a245c8
787eac9
d3a36a2
c3ce722
364bf23
c3ce722
 
364bf23
 
c3ce722
 
 
db84ee0
c3ce722
 
 
 
 
0bf83d7
364bf23
0bf83d7
c3ce722
8a74c5f
0bf83d7
a8557d1
364bf23
 
c3ce722
 
 
 
 
 
364bf23
c3ce722
364bf23
 
c3ce722
364bf23
 
 
 
 
d1d3fa0
b43564d
c3ce722
 
364bf23
 
 
3815070
364bf23
8a74c5f
364bf23
3815070
 
 
 
c3ce722
787eac9
 
 
 
 
 
 
 
364bf23
89fab02
b6071fa
 
c3ce722
 
 
1aa4ab6
c3ce722
67f9a7a
 
 
89fab02
dc47a5a
a8557d1
3815070
364bf23
3815070
cd77266
3815070
364bf23
c3ce722
 
 
 
 
 
a8557d1
787eac9
6627d0a
a8557d1
3815070
6627d0a
8a74c5f
364bf23
 
db84ee0
 
c86b2dd
 
 
a8557d1
c86b2dd
 
 
 
 
 
db84ee0
c86b2dd
 
 
8a74c5f
c86b2dd
 
c3ce722
c86b2dd
 
 
8a74c5f
c86b2dd
 
 
 
 
 
db84ee0
c86b2dd
 
 
 
 
 
8a74c5f
c86b2dd

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
import tempfile
from sentence_transformers import SentenceTransformer
from safetensors import safe_open
from transformers import pipeline, AutoTokenizer

# ---------------------------------------------------------------------------
# Resources loaded once, at import time, and shared by every request.
# ---------------------------------------------------------------------------

# Trial table; the matcher below reads its this_space, nct_id, title,
# brief_summary and eligibility_criteria columns.
trial_spaces = pd.read_csv(
    'ctgov_all_trials_trial_space_lineitems_10-31-24.csv'
)

# Embedding model used to encode the patient summary at query time.
embedding_model = SentenceTransformer(
    'ksg-dfci/TrialSpace',
    trust_remote_code=True,
)

# Precomputed embeddings stored under the "space_embeddings" key.
# NOTE(review): the matcher uses positions in this tensor as row positions
# in trial_spaces, so the two are presumably row-aligned -- confirm.
with safe_open("trial_space_embeddings.safetensors", framework="pt") as embedding_store:
    trial_space_embeddings = embedding_store.get_tensor("space_embeddings")

# Text-classification pipeline used to check each (trial space, patient) pair.
# The tokenizer is loaded separately from the "roberta-large" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
_CHECKER_TOKENIZATION_OPTIONS = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 512,
}
checker_pipe = pipeline(
    'text-classification',
    'ksg-dfci/TrialChecker',
    tokenizer=tokenizer,
    **_CHECKER_TOKENIZATION_OPTIONS,
)

def match_clinical_trials_dropdown(patient_summary: str, max_results_str: str):
    """Find candidate trials for a patient summary and fill the trial dropdown.

    The patient summary is embedded and compared (cosine similarity) against
    the precomputed trial-space embeddings. The most similar trial spaces are
    then passed through the checker pipeline, and only candidates labelled
    'POSITIVE' are kept.

    Args:
        patient_summary: Free-text patient summary to match against trials.
        max_results_str: Maximum number of candidates to retrieve, as text.
            Values that cannot be parsed as an integer fall back to 10; the
            parsed value is clamped to the range 1-50. Because the checker
            filters candidates afterwards, fewer results may be returned.

    Returns:
        A ``(gr.Dropdown, pandas.DataFrame)`` tuple. The dropdown lists the
        kept trials as ``"<rank>. <nct_id> - <title>"`` with the first entry
        preselected (or no choices when nothing matched); the DataFrame holds
        the matching rows for the details view and the CSV export.
    """
    output_columns = [
        'patient_summary_query',
        'nct_id',
        'trial_title',
        'trial_brief_summary',
        'trial_eligibility_criteria',
        'this_space',
        'trial_checker_result',
        'trial_checker_score',
    ]

    # Nothing to match: skip the (slow) model calls and return an empty result
    # with the usual columns so downstream consumers keep working.
    if not patient_summary or not patient_summary.strip():
        empty_dropdown = gr.Dropdown(choices=[], interactive=True, value=None)
        return empty_dropdown, pd.DataFrame(columns=output_columns)

    # Parse the requested result count; TypeError covers a None input.
    try:
        max_results = int(max_results_str)
    except (TypeError, ValueError):
        max_results = 10  # if invalid input, default to 10

    # Clamp within [1, 50]
    max_results = max(1, min(max_results, 50))

    # 1. Encode user input
    patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)

    # 2. Compute similarities
    similarities = F.cosine_similarity(patient_embedding, trial_space_embeddings)

    # 3. Pull the top 'max_results' entries, highest similarity first.
    #    topk avoids sorting the whole similarity vector; k is capped so it
    #    never exceeds the number of available embeddings.
    top_k = min(max_results, similarities.shape[0])
    top_indices = torch.topk(similarities, k=top_k).indices.cpu().numpy()

    # 4. Build DataFrame from a single positional selection of the trial table
    candidates = trial_spaces.iloc[top_indices]
    analysis = pd.DataFrame({
        'patient_summary_query': patient_summary,
        'nct_id': candidates.nct_id,
        'trial_title': candidates.title,
        'trial_brief_summary': candidates.brief_summary,
        'trial_eligibility_criteria': candidates.eligibility_criteria,
        'this_space': candidates.this_space,
    }).reset_index(drop=True)

    # 5. Prepare the text pairs for the checker pipeline
    analysis['pt_trial_pair'] = (
        analysis['this_space']
        + "\nNow here is the patient summary:"
        + analysis['patient_summary_query']
    )

    # 6. Run checker pipeline
    classifier_results = checker_pipe(analysis['pt_trial_pair'].tolist())
    analysis['trial_checker_result'] = [x['label'] for x in classifier_results]
    analysis['trial_checker_score'] = [x['score'] for x in classifier_results]

    # 7. Restrict to POSITIVE results only; the index is reset so that the
    #    1-based rank shown in the dropdown maps back to a row position.
    analysis = analysis[analysis.trial_checker_result == 'POSITIVE'].reset_index(drop=True)

    # 8. Final columns
    out_df = analysis[output_columns]

    # Build the dropdown choices, e.g., "1. NCT001 - Some Title"
    dropdown_options = [
        f"{rank}. {nct_id} - {title}"
        for rank, (nct_id, title) in enumerate(
            zip(out_df['nct_id'], out_df['trial_title']), start=1
        )
    ]

    # Preselect the first match; with no matches the dropdown stays empty.
    default_choice = dropdown_options[0] if dropdown_options else None
    return (
        gr.Dropdown(choices=dropdown_options, interactive=True, value=default_choice),
        out_df
    )

def show_selected_trial(selected_option: str, df: pd.DataFrame):
    """Build the plain-text details shown for the trial picked in the dropdown.

    Args:
        selected_option: Dropdown label of the form "1. NCT001 - Some Title".
            The number before the first "." is the 1-based row position in
            ``df``. Empty or None means nothing is selected.
        df: Results table holding the columns read below. May be None when no
            results have been stored yet.

    Returns:
        An empty string when nothing is selected, a "No data found" message
        when the label cannot be mapped to a row of ``df``, and otherwise a
        multi-line description of the selected trial.
    """
    if not selected_option:
        return ""

    not_found = "No data found for the selected trial."

    # No results table to look the selection up in.
    if df is None:
        return not_found

    # Parse the index from "1. NCT001 - Some Title"
    rank_text, _, _ = selected_option.partition(".")
    try:
        chosen_index = int(rank_text.strip()) - 1
    except ValueError:
        return not_found

    if not 0 <= chosen_index < len(df):
        return not_found

    record = df.iloc[chosen_index].to_dict()
    details = (
        f"Patient Summary Query: {record['patient_summary_query']}\n\n"
        f"NCT ID: {record['nct_id']}\n"
        f"Trial Title: {record['trial_title']}\n\n"
        f"Trial Space: {record['this_space']}\n\n"
        f"Trial Checker Result: {record['trial_checker_result']}\n"
        f"Trial Checker Score: {record['trial_checker_score']}\n\n"
        f"Brief Summary: {record['trial_brief_summary']}\n\n"
        f"Full Eligibility Criteria: {record['trial_eligibility_criteria']}\n\n"
    )
    return details

def export_results(df: pd.DataFrame):
    """Save the results table to a temporary CSV file and return its path.

    The returned path is given to a Gradio File component so the user can
    download the file.

    Args:
        df: Results table to export. May be None when there are no stored
            results yet (for example, export requested before any search).

    Returns:
        Path of the written CSV file, or None when there is nothing to export.
    """
    if df is None:
        return None

    # Use NamedTemporaryFile only to reserve a unique path, and close the
    # handle before pandas writes to it: this avoids leaking an open file
    # descriptor per export and works on platforms that refuse to reopen a
    # file that is still open. delete=False keeps the file for the download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp:
        csv_path = temp.name
    df.to_csv(csv_path, index=False)
    return csv_path

# CSS handed to gr.Blocks(css=...) below. The "#input_box textarea" selector
# targets the textarea of the component created with elem_id="input_box"
# (the patient summary textbox) and forces it to a fixed 600px x 250px size.
custom_css = """

#input_box textarea {

    width: 600px !important;

    height: 250px !important;

}

"""

# Gradio UI: inputs (patient summary, max results), a dropdown of matched
# trials with a details box, and a CSV export of the current results.
with gr.Blocks(css=custom_css) as demo:
    # Intro text and disclaimers
    gr.HTML("""

    <h3>Demonstration version of clinical trial search based on MatchMiner-AI</h3>

    <p>Based on clinicaltrials.gov cancer trials export 10/31/24.</p>

    <p>Queries take approximately 30 seconds to run per ten results returned, 

       since demo is running on a small CPU instance.</p>

    <p>Disclaimers:</p>

    <p>1. Not a clinical decision support tool. Queries are not saved, but do not input protected health information.</p>

    <p>2. AI-extracted trial "spaces" and candidate matches may contain errors</p>

    <p>3. Will not necessarily return all trials from clinicaltrials.gov that match a given query</p>

    <p>4. Under active development; interface and underlying models will change</p>

    <p>5. For better results, spell out cancer types (eg, enter "acute myeloid leukemia" rather than "AML")</p>

    """)

    # Textbox for patient summary; elem_id ties it to the custom CSS sizing
    patient_summary_input = gr.Textbox(
        label="Enter Patient Summary",
        elem_id="input_box",
        value="Cancer type: Non-small cell lung cancer.  Histology: Adenocarcinoma.  Extent of disease: Metastatic.  Prior treatment: Pembrolizumab.  Biomarkers: PD-L1 high, KRAS G12C mutant."
    )

    # Textbox for max results; the value arrives as text and is parsed and
    # clamped inside match_clinical_trials_dropdown
    max_results_input = gr.Textbox(
        label="Enter the maximum number of results to return (1-50)",
        value="10"  # default
    )

    # Button to run the matching
    submit_btn = gr.Button("Find Matches")

    # We'll store the DataFrame in a State for CSV export + reference
    results_state = gr.State()

    # Dropdown (initially empty)
    trial_dropdown = gr.Dropdown(
        label="Select a Trial",
        choices=[],
        value=None,
        interactive=True
    )

    # Textbox for showing details of the selected trial
    trial_details_box = gr.Textbox(
        label="Selected Trial Details",
        lines=12,
        interactive=False
    )

    # Export button + file
    export_btn = gr.Button("Export Results")
    download_file = gr.File()

    # 1) "Find Matches" => updates the dropdown choices and the state
    submit_btn.click(
        fn=match_clinical_trials_dropdown,
        inputs=[patient_summary_input, max_results_input],
        outputs=[trial_dropdown, results_state]
    )

    # 2) Selecting from the dropdown => shows more info
    trial_dropdown.change(
        fn=show_selected_trial,
        inputs=[trial_dropdown, results_state],
        outputs=trial_details_box
    )

    # 3) Export => CSV
    export_btn.click(
        fn=export_results,
        inputs=results_state,
        outputs=download_file
    )

    # Enable queue so "Processing..." is shown if logic is slow
    demo.queue()

if __name__ == "__main__":
    demo.launch()