File size: 6,885 Bytes
b31ccfa
 
 
 
 
 
 
 
10b9217
b31ccfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import torch
import os
from pathlib import Path
from matchms import Spectrum
from typing import List, Optional, Literal
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())

def preprocess_spectra(spectra: List[Spectrum]) -> List[Optional[Spectrum]]:
    """Apply a default matchms cleanup pipeline to every spectrum.

    Parameters
    ----------
    spectra:
        Raw spectra, e.g. as loaded from an MGF file.

    Returns
    -------
    A list the same length as the input. An entry is ``None`` when its
    spectrum fails a filter (e.g. fewer than 5 peaks remain after cleanup),
    so callers must be prepared to handle ``None`` values.
    """
    # Imported lazily so merely importing this module stays cheap.
    from matchms.filtering import (
        normalize_intensities,
        reduce_to_number_of_peaks,
        require_minimum_number_of_peaks,
        select_by_mz,
        select_by_relative_intensity,
    )

    def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
        """
        One of the many ways to preprocess the spectrum - we use this by default.
        """
        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
        spectrum = normalize_intensities(spectrum)
        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
        # This filter returns None when too few peaks survive the steps above.
        spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
        return spectrum

    return [process_spectrum(s) for s in spectra]  # Some entries might be None

def run(r_filepath: Path, q_filepath: Path,
        tolerance: float = 0.1,
        mz_power: float = 0.0,
        intensity_power: float = 1.0,
        shift: float = 0,
        batch_size: int = 2048,
        n_max_peaks: int = 1024,
        match_limit: int = 2048,
        array_type: Literal['sparse', 'numpy'] = "numpy",
        sparse_threshold: float = .75):
    """Compute CudaCosineGreedy similarity between two MGF files.

    Loads and preprocesses both files, runs the GPU cosine-greedy kernel,
    and writes three artifacts to temp files for Gradio to serve.

    Parameters mirror ``CudaCosineGreedy``; ``array_type='sparse'`` discards
    scores below ``sparse_threshold``.

    Returns
    -------
    (npz_path, jpg_path, pickle_path):
        Paths to the saved score array, the score/matches heatmap image,
        and the pickled ``Scores`` object.

    Raises
    ------
    ValueError
        If either input file is missing.
    """
    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')

    # Explicit raises instead of asserts: asserts are stripped under `python -O`.
    if r_filepath is None:
        raise ValueError("Reference file is missing.")
    if q_filepath is None:
        raise ValueError("Query file is missing.")

    import tempfile
    import numpy as np
    from cudams.similarity import CudaCosineGreedy
    from matchms.importing import load_from_mgf
    from matchms import calculate_scores
    import matplotlib.pyplot as plt

    refs = preprocess_spectra(list(load_from_mgf(str(r_filepath))))
    ques = preprocess_spectra(list(load_from_mgf(str(q_filepath))))

    # If we have small spectra, don't make a huge batch (but keep it >= 1
    # so degenerate/empty inputs don't produce a zero batch size).
    if batch_size > max(len(refs), len(ques)):
        batch_size = max(1, len(refs), len(ques))

    scores_obj = calculate_scores(
        refs, ques,
        similarity_function=CudaCosineGreedy(
            tolerance=tolerance,
            mz_power=mz_power,
            intensity_power=intensity_power,
            shift=shift,
            batch_size=batch_size,
            n_max_peaks=n_max_peaks,
            match_limit=match_limit,
            sparse_threshold=sparse_threshold
        ),
        array_type=array_type
    )

    # delete=False: the files must outlive this call so Gradio can serve them.
    score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
    score_vis.close()  # we only need the path; avoid leaking the fd

    fig, axs = plt.subplots(1, 2,
                            figsize=(10, 5),
                            dpi=150)

    scores = scores_obj.to_array()
    axs[0].imshow(scores['CudaCosineGreedy_score'])
    axs[1].imshow(scores['CudaCosineGreedy_matches'])

    plt.suptitle("Score and matches")
    plt.savefig(score_vis.name)
    # Close the figure so repeated Gradio calls don't accumulate figures in memory.
    plt.close(fig)

    score = tempfile.NamedTemporaryFile(suffix='.npz', delete=False)
    score.close()
    np.savez(score.name, scores=scores)

    import pickle
    pickle_ = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
    pickle_.close()
    Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))

    return score.name, score_vis.name, pickle_.name

# --- Gradio UI: wires the widgets below into `run` -------------------------
with gr.Blocks() as demo:
    gr.Markdown("Run Cuda Cosine Greedy on your MGF files.")

    # Input files (pre-filled with the bundled pesticides test set).
    with gr.Row():
        refs_file = gr.File(
            label="Upload REFERENCES.mgf",
            interactive=True,
            value='tests/data/pesticides.mgf',
        )
        ques_file = gr.File(
            label="Upload QUERIES.mgf",
            interactive=True,
            value='tests/data/pesticides.mgf',
        )

    # Core cosine-greedy parameters.
    with gr.Row():
        tolerance_in = gr.Slider(minimum=0, maximum=1, value=0.1, label="Tolerance")
        mz_power_in = gr.Slider(minimum=0, maximum=2, value=0.0, label="mz_power")
        intensity_power_in = gr.Slider(minimum=0, maximum=2, value=1.0, label="Intensity Power")
        shift_in = gr.Slider(minimum=-10, maximum=10, value=0, label="Shift")

    # GPU sizing knobs.
    with gr.Row():
        batch_size_in = gr.Number(value=2048, label="Batch Size", info='How many spectra to process pairwise, in one step. Limited by GPU size, default works well for the T4 GPU.')
        n_max_peaks_in = gr.Number(value=1024, label="Maximum Number of Peaks",
                                   info="Some spectra are too large to fit on GPU,"
                                       "so we have to trim them to only use the first "
                                       "n_max_peaks number of peaks.")
        match_limit_in = gr.Number(value=2048, label="Match Limit",
                                   info="Two very similar spectra of size N and M can have N * M matches, before filtering."
                                        "This doesn't fit on GPU, so we stop accumulating more matches once we have at most match_limit number of them."
                                        "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")

    # Output-format controls.
    with gr.Row():
        array_type_in = gr.Radio(['numpy', 'sparse'], value='numpy', type='value',
                                 label='How to handle outputs - if sparse, everything with score less than sparse_threshold will be discarded. If `numpy`, we disable sparse behaviour.')
        sparse_threshold_in = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
                                        info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
                                           "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
                                           "results as a SparseStack format."
                                           )

    # Result widgets: heatmap preview plus downloadable artifacts.
    with gr.Row():
        score_vis = gr.Image()
    with gr.Row():
        out_npz = gr.File(label="Download similarity matrix (.npz format)",
                          interactive=False)
        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
                             interactive=False)

    run_btn = gr.Button("Run")
    run_btn.click(
        fn=run,
        inputs=[refs_file, ques_file, tolerance_in, mz_power_in, intensity_power_in, shift_in,
                batch_size_in, n_max_peaks_in, match_limit_in,
                array_type_in, sparse_threshold_in],
        outputs=[out_npz, score_vis, out_pickle],
    )

if __name__ == "__main__":
    demo.launch(debug=True)