File size: 6,885 Bytes
b31ccfa 10b9217 b31ccfa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
import torch
import os
from pathlib import Path
from matchms import Spectrum
from typing import List, Optional, Literal
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())
def preprocess_spectra(spectra: List[Spectrum]) -> List[Spectrum]:
    """Apply the default matchms cleanup pipeline to a list of spectra.

    Each spectrum is restricted to m/z 10-1000, intensity-normalized,
    stripped of near-zero peaks, trimmed to at most 1024 peaks, and
    required to keep at least 5 peaks.

    Spectra rejected by the pipeline come back as ``None`` from matchms;
    they are dropped here so downstream scoring never sees ``None``.

    Args:
        spectra: Raw spectra, e.g. as loaded by ``load_from_mgf``.

    Returns:
        The surviving, cleaned spectra (possibly fewer than the input).
    """
    from matchms.filtering import select_by_intensity, \
                        normalize_intensities, \
                        select_by_relative_intensity, \
                        reduce_to_number_of_peaks, \
                        select_by_mz, \
                        require_minimum_number_of_peaks

    def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
        """One of the many ways to preprocess the spectrum - we use this by default.

        Returns None when the spectrum fails the minimum-peak requirement.
        """
        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
        spectrum = normalize_intensities(spectrum)
        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
        spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
        return spectrum

    # Drop rejected spectra (None) instead of passing them downstream.
    return [cleaned for cleaned in map(process_spectrum, spectra)
            if cleaned is not None]
def run(r_filepath: Path, q_filepath: Path,
        tolerance: float = 0.1,
        mz_power: float = 0.0,
        intensity_power: float = 1.0,
        shift: float = 0,
        batch_size: int = 2048,
        n_max_peaks: int = 1024,
        match_limit: int = 2048,
        array_type: Literal['sparse', 'numpy'] = "numpy",
        sparse_threshold: float = .75):
    """Score two MGF files against each other with CudaCosineGreedy.

    Loads and preprocesses both spectrum sets, computes the pairwise
    similarity matrix on GPU, and writes three downloadable artifacts
    to temp files (``delete=False`` so Gradio can serve them afterwards).

    Args:
        r_filepath: Path to the reference ``.mgf`` file.
        q_filepath: Path to the query ``.mgf`` file.
        tolerance, mz_power, intensity_power, shift, batch_size,
        n_max_peaks, match_limit, sparse_threshold:
            Passed straight through to ``CudaCosineGreedy``.
        array_type: 'numpy' for a dense result, 'sparse' to discard
            scores below ``sparse_threshold``.

    Returns:
        Tuple of file paths: (raw arrays ``.npz``, heatmap ``.jpg``,
        pickled ``Scores`` object).

    Raises:
        ValueError: If either input file is missing.
    """
    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
    # Gradio passes None when a file input is cleared. Raise instead of
    # assert: asserts are stripped under `python -O`.
    if r_filepath is None:
        raise ValueError("Reference file is missing.")
    if q_filepath is None:
        raise ValueError("Query file is missing.")

    # Heavy imports are deferred so the UI starts quickly.
    import pickle
    import tempfile
    import numpy as np
    from cudams.similarity import CudaCosineGreedy
    from matchms.importing import load_from_mgf
    from matchms import calculate_scores
    import matplotlib.pyplot as plt

    refs = preprocess_spectra(list(load_from_mgf(str(r_filepath))))
    ques = preprocess_spectra(list(load_from_mgf(str(q_filepath))))

    # If we have small spectra, don't make a huge batch.
    batch_size = min(batch_size, max(len(refs), len(ques)))

    scores_obj = calculate_scores(
        refs, ques,
        similarity_function=CudaCosineGreedy(
            tolerance=tolerance,
            mz_power=mz_power,
            intensity_power=intensity_power,
            shift=shift,
            batch_size=batch_size,
            n_max_peaks=n_max_peaks,
            match_limit=match_limit,
            sparse_threshold=sparse_threshold
        ),
        array_type=array_type
    )

    # Side-by-side heatmaps of the score and match-count matrices.
    score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
    fig, axs = plt.subplots(1, 2,
                            figsize=(10, 5),
                            dpi=150)
    scores = scores_obj.to_array()
    axs[0].imshow(scores['CudaCosineGreedy_score'])
    axs[1].imshow(scores['CudaCosineGreedy_matches'])
    fig.suptitle("Score and matches")
    fig.savefig(score_vis.name)
    # Close explicitly: pyplot keeps figures alive otherwise, which leaks
    # memory across requests in a long-running Gradio server.
    plt.close(fig)

    # Raw structured array for programmatic download.
    score = tempfile.NamedTemporaryFile(suffix='.npz', delete=False)
    np.savez(score.name, scores=scores)

    # Full matchms Scores object, for users who want everything.
    pickle_ = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
    Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))

    return score.name, score_vis.name, pickle_.name
# ---------------------------------------------------------------------------
# Gradio UI. Layout: two MGF file inputs, the CudaCosineGreedy
# hyper-parameters, and three output widgets, wired to `run` via the button.
# `demo` is launched under the __main__ guard at the bottom of the file.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("Run Cuda Cosine Greedy on your MGF files.")
    with gr.Row():
        # Reference / query inputs; both default to the bundled test dataset.
        refs = gr.File(label="Upload REFERENCES.mgf",
                       interactive=True,
                       value='tests/data/pesticides.mgf')
        ques = gr.File(label="Upload QUERIES.mgf",
                       interactive=True,
                       value='tests/data/pesticides.mgf')
    with gr.Row():
        # Core CosineGreedy parameters (defaults mirror `run`'s signature).
        tolerance = gr.Slider(minimum=0, maximum=1, value=0.1, label="Tolerance")
        mz_power = gr.Slider(minimum=0, maximum=2, value=0.0, label="mz_power")
        intensity_power = gr.Slider(minimum=0, maximum=2, value=1.0, label="Intensity Power")
        shift = gr.Slider(minimum=-10, maximum=10, value=0, label="Shift")
    with gr.Row():
        # GPU resource knobs.
        batch_size = gr.Number(value=2048, label="Batch Size", info='How many spectra to process pairwise, in one step. Limited by GPU size, default works well for the T4 GPU.')
        n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
                                info="Some spectra are too large to fit on GPU,"
                                "so we have to trim them to only use the first "
                                "n_max_peaks number of peaks.")
        match_limit = gr.Number(value=2048, label="Match Limit",
                                info="Two very similar spectra of size N and M can have N * M matches, before filtering."
                                "This doesn't fit on GPU, so we stop accumulating more matches once we have at most match_limit number of them."
                                "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
    with gr.Row():
        # Output-format controls (dense vs. sparse score matrix).
        array_type = gr.Radio(['numpy', 'sparse'], value='numpy', type='value',
                              label='How to handle outputs - if sparse, everything with score less than sparse_threshold will be discarded. If `numpy`, we disable sparse behaviour.')
        sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
                                     info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
                                     "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
                                     "results as a SparseStack format."
                                     )
    with gr.Row():
        # Heatmap of scores/matches produced by `run`.
        score_vis = gr.Image()
    with gr.Row():
        # Downloadable result files (order matches `run`'s return tuple).
        out_npz = gr.File(label="Download similarity matrix (.npz format)",
                          interactive=False)
        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
                             interactive=False)
    btn = gr.Button("Run")
    # Input order must match `run`'s positional parameters exactly.
    btn.click(fn=run, inputs=[refs, ques, tolerance, mz_power, intensity_power, shift,
                              batch_size, n_max_peaks, match_limit,
                              array_type, sparse_threshold], outputs=[out_npz, score_vis, out_pickle])
if __name__ == "__main__":
    # debug=True surfaces server tracebacks in the browser while developing.
    demo.launch(debug=True)