import os import torch import numpy as np import gradio as gr import zipfile import json import requests import subprocess import shutil from transformers import BlipProcessor, BlipForConditionalGeneration title = "# 🗜️ CLaMP 3 - Multimodal & Multilingual Semantic Music Search" badges = """
Homepage Paper GitHub Demo Model Weights Dataset Benchmark
""" description = """CLaMP 3 is a **multimodal and multilingual** music information retrieval (MIR) framework, supporting **sheet music, audio, and performance signals** in **100 languages**. Using **contrastive learning**, it aligns these modalities in a shared space for **cross-modal retrieval**. ### 🔍 **How This Demo Works** - You can **retrieve music using any text input (in any language) or an image** (`.png`, `.jpg`). - When using an image, **BLIP** generates a caption, which is then used for retrieval. - Since CLaMP 3's training data includes **rich visual descriptions of musical scenes**, it can **match images to semantically relevant music**. - For simplicity, this demo retrieves music based on **metadata (text descriptions)** rather than directly searching sheet music, MIDI, or audio files. ### ⚠️ **Limitations** - This demo retrieves music **only from the WikiMT-X benchmark (1,000 pieces)**. - These pieces are **mainly from the U.S. and Western Europe (especially the U.S.)** and **mostly from the 20th century**. - Thus, retrieval results are **mostly limited to Western 20th-century music**, so you **won’t** find music from **other regions or historical periods**. 🔧 **Need retrieval for a different music collection?** Deploy **[CLaMP 3](https://github.com/sanderwood/clamp3)** on your own dataset. Generally, the larger and more diverse the reference music dataset, the better the retrieval quality, increasing the likelihood of finding relevant and accurately matched music. **Note: This project is for research use only.** """ # Load BLIP image captioning model and processor processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") # Download weight file if it does not exist weights_url = "https://huggingface.co/sander-wood/clamp3/resolve/main/weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth" weights_filename = "weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth" if not os.path.exists(weights_filename): print("Downloading weights file...") response = requests.get(weights_url, stream=True) response.raise_for_status() with open(weights_filename, "wb") as f: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) print("Weights file downloaded.") ZIP_PATH = "features.zip" if os.path.exists(ZIP_PATH): print(f"Extracting {ZIP_PATH}...") with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref: zip_ref.extractall(".") print("Extraction complete.") # Load metadata metadata_map = {} METADATA_FILE = "wikimt-x-public.jsonl" if os.path.exists(METADATA_FILE): with open(METADATA_FILE, "r", encoding="utf-8") as f: for line in f: data = json.loads(line) metadata_map[data["id"]] = data else: print(f"Warning: {METADATA_FILE} not found.") features_cache = {} def get_info(folder_path): """ Load all .npy files from the specified folder and return a dictionary with the file names (without extension) as keys. """ if folder_path in features_cache: return features_cache[folder_path] if not os.path.exists(folder_path): return {} files = sorted(os.listdir(folder_path)) features = {} for file in files: if file.endswith(".npy"): key = file.split(".")[0] try: features[key] = np.load(os.path.join(folder_path, file))[0] except Exception as e: print(f"Error loading {file}: {e}") features_cache[folder_path] = features return features def find_top_similar(query_file, reference_folder): """ Compare the query feature with all reference features in the specified folder using cosine similarity and return the top 10 candidate results in the format: Title | Artists | sim: SimilarityScore. """ top_k = 10 try: query_feature = np.load(query_file.name)[0] except Exception as e: return [], f"Error loading query feature: {e}" query_tensor = torch.tensor(query_feature, dtype=torch.float32).unsqueeze(dim=0) key_features = get_info(reference_folder) if not key_features: return [], f"No reference features found in {reference_folder}." ref_keys = list(key_features.keys()) ref_array = np.array([key_features[k] for k in ref_keys]) key_feats_tensor = torch.tensor(ref_array, dtype=torch.float32) query_tensor_expanded = query_tensor.expand(key_feats_tensor.size(0), -1) similarities = torch.cosine_similarity(query_tensor_expanded, key_feats_tensor, dim=1) ranked_indices = torch.argsort(similarities, descending=True) candidate_ids = [] candidate_display = [] for i in range(top_k): if i < len(ref_keys): candidate_idx = ranked_indices[i].item() candidate_id = ref_keys[candidate_idx] sim = round(similarities[candidate_idx].item(), 4) meta = metadata_map.get(candidate_id, {}) title = meta.get("title", candidate_id) artists = meta.get("artists", "Unknown") if isinstance(artists, list): artists = ", ".join(artists) candidate_ids.append(candidate_id) candidate_display.append(f"{title} | {artists} | sim: {sim}") else: candidate_ids.append("N/A") candidate_display.append("N/A") return candidate_ids, candidate_display def show_details(selected_id): """ Return detailed metadata and embedded YouTube video HTML based on the candidate ID. """ if selected_id == "N/A": return ("", "", "", "", "", "", "", "") data = metadata_map.get(selected_id, {}) if not data: return ("No details found", "", "", "", "", "", "", "") title = data.get("title", "") artists = data.get("artists", "") if isinstance(artists, list): artists = ", ".join(artists) genre = data.get("genre", "") background = data.get("background", "") analysis = data.get("analysis", "") description = data.get("description", "") scene = data.get("scene", "") youtube_html = ( f'' ) return title, artists, genre, background, analysis, description, scene, youtube_html def extract_features_from_text(text): """ Save the input text to a file, call the CLaMP 3 feature extraction script, and return the generated feature file path. """ input_dir = "input_dir" output_dir = "output_dir" os.makedirs(input_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True) # Clear input_dir and output_dir for d in [input_dir, output_dir]: for filename in os.listdir(d): file_path = os.path.join(d, filename) if os.path.isfile(file_path) or os.path.islink(file_path): os.unlink(file_path) elif os.path.isdir(file_path): shutil.rmtree(file_path) input_file = os.path.join(input_dir, "input.txt") print("Text input:", text) with open(input_file, "w", encoding="utf-8") as f: f.write(text) command = ["python", "extract_clamp3.py", input_dir, output_dir, "--get_global"] subprocess.run(command, check=True) output_file = os.path.join(output_dir, "input.npy") return output_file def generate_caption(image): """ Use the BLIP model to generate a descriptive caption for the given image. """ inputs = processor(image, return_tensors="pt") outputs = blip_model.generate(**inputs) caption = processor.decode(outputs[0], skip_special_tokens=True) return caption class FileWrapper: """ Simulate a file object with a .name attribute. """ def __init__(self, path): self.name = path def search_wrapper(search_mode, text_input, image_input): """ Perform retrieval based on the selected input mode: - If search_mode is "Image", use the uploaded image to generate a caption, then extract features and search in the "image/" folder. - If search_mode is "Text", use the provided text to extract features and search in the "image/" folder. """ if search_mode == "Image": if image_input is None: return text_input, gr.update(choices=[]), "Please upload an image.", "", "", "", "", "", "", "" caption = generate_caption(image_input) text_to_use = caption reference_folder = "image/" elif search_mode == "Text": if not text_input or text_input.strip() == "": return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Please enter text for retrieval.", "", "", "", "", "", "", "" text_to_use = text_input reference_folder = "text/" else: return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Invalid search mode selected.", "", "", "", "", "", "", "" try: output_file = extract_features_from_text(text_to_use) query_file = FileWrapper(output_file) except Exception as e: return text_to_use, gr.update(choices=[]), f"Error during feature extraction: {e}", "", "", "", "", "", "", "" candidate_ids, candidate_display = find_top_similar(query_file, reference_folder) if not candidate_ids: return text_to_use, gr.update(choices=[]), "", "", "", "", "", "", "", "" choices = [(f"{i+1}. {disp}", cid) for i, (cid, disp) in enumerate(zip(candidate_ids, candidate_display))] top_candidate = candidate_ids[0] details = show_details(top_candidate) return text_to_use, gr.update(choices=choices), *details # 定义示例数据(示例数据放在组件定义之后也可以正常运行) examples = [ ["Image", None, "V4EauuhVEw4.jpg"], ["Image", None, "Kw-_Ew5bVxs.jpg"], ["Image", None, "BuYf0taXoNw.webp"], ["Image", None, "4tDYMayp6Dk.jpg"], ["Text", "classic rock, British, 1960s, upbeat", None], ["Text", "A Latin jazz piece with rhythmic percussion and brass", None], ["Text", "big band, major key, swing, brass-heavy, syncopation, baritone vocal", None], ["Text", "Heartfelt and nostalgic, with a bittersweet, melancholic feel", None], ["Text", "Melodía instrumental en re mayor con progresión armónica repetitiva y fluida", None], ["Text", "D大调四四拍的爱尔兰舞曲", None], ["Text", "Ιερή μουσική με πνευματική ατμόσφαιρα", None], ["Text", "የፍቅር ሙዚቃ ሞቅ እና ስሜታማ ከሆነ ነገር ግን ድንቅ እና አስደሳች ቃላት ያካትታል", None], ] with gr.Blocks() as demo: gr.Markdown(title) gr.HTML(badges) gr.Markdown(description) with gr.Row(): with gr.Column(): search_mode = gr.Radio( choices=["Text", "Image"], label="Select Search Mode", value="Text", interactive=True, elem_classes=["vertical-radio"] ) text_input = gr.Textbox( placeholder="Describe the music you're looking for (in any language)", lines=4 ) image_input = gr.Image( label="Or upload an image (PNG, JPG)", type="pil" ) search_button = gr.Button("Search from 1,000 Western 20th-century music in WikiMT-X") candidate_radio = gr.Radio(choices=[], label="Select Retrieval Result", interactive=True, elem_classes=["vertical-radio"]) with gr.Column(): gr.Markdown("### YouTube Video") youtube_box = gr.HTML(label="YouTube Video") gr.Markdown("### Metadata") title_box = gr.Textbox(label="Title", interactive=False) artists_box = gr.Textbox(label="Artists", interactive=False) genre_box = gr.Textbox(label="Genre", interactive=False) background_box = gr.Textbox(label="Background", interactive=False) analysis_box = gr.Textbox(label="Analysis", interactive=False) description_box = gr.Textbox(label="Description", interactive=False) scene_box = gr.Textbox(label="Scene", interactive=False) gr.HTML( """ """ ) gr.Examples( examples=examples, inputs=[search_mode, text_input, image_input], outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box], fn=search_wrapper, cache_examples=False, ) search_button.click( fn=search_wrapper, inputs=[search_mode, text_input, image_input], outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box] ) candidate_radio.change( fn=show_details, inputs=candidate_radio, outputs=[title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box] ) demo.launch()