shayan5422 committed
Commit b7fc810 · verified · 1 parent: 8181a7b

Upload 1366 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. index.faiss +2 -2
  2. index_to_metadata.pkl +2 -2
  3. model_data_json/Alibaba-NLP_gte-modernbert-base.json +26 -0
  4. model_data_json/BAAI_bge-multilingual-gemma2.json +0 -0
  5. model_data_json/BAAI_llm-embedder.json +19 -0
  6. model_data_json/EleutherAI_pythia-1b.json +25 -0
  7. model_data_json/FacebookAI_xlm-roberta-large-finetuned-conll03-english.json +115 -0
  8. model_data_json/Helsinki-NLP_opus-mt-en-da.json +20 -0
  9. model_data_json/HuggingFaceTB_SmolLM2-1.7B-Instruct.json +25 -0
  10. model_data_json/HuggingFaceTB_SmolVLM-Instruct.json +23 -0
  11. model_data_json/HuggingFaceTB_SmolVLM2-2.2B-Instruct.json +33 -0
  12. model_data_json/KBLab_kb-whisper-small.json +21 -0
  13. model_data_json/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json +21 -0
  14. model_data_json/LorenzoDeMattei_GePpeTto.json +20 -0
  15. model_data_json/M-CLIP_XLM-Roberta-Large-Vit-B-16Plus.json +61 -0
  16. model_data_json/MahmoodLab_TITAN.json +22 -0
  17. model_data_json/MoritzLaurer_bge-m3-zeroshot-v2.0.json +22 -0
  18. model_data_json/NbAiLab_nb-wav2vec2-300m-nynorsk.json +21 -0
  19. model_data_json/OpenGVLab_InternVL2_5-38B-MPO.json +27 -0
  20. model_data_json/OpenGVLab_InternVL3-2B.json +28 -0
  21. model_data_json/QuantFactory_Meta-Llama-3.1-8B-Instruct-GGUF.json +28 -0
  22. model_data_json/Qwen_QwQ-32B-GGUF.json +20 -0
  23. model_data_json/Qwen_Qwen2-VL-2B.json +21 -0
  24. model_data_json/Qwen_Qwen2-VL-72B.json +21 -0
  25. model_data_json/Qwen_Qwen2.5-32B-Instruct-AWQ.json +26 -0
  26. model_data_json/Qwen_Qwen2.5-32B.json +16 -0
  27. model_data_json/Qwen_Qwen2.5-Math-7B-Instruct.json +23 -0
  28. model_data_json/Qwen_Qwen2.5-VL-7B-Instruct-AWQ.json +26 -0
  29. model_data_json/Qwen_Qwen3-32B.json +18 -0
  30. model_data_json/Qwen_Qwen3-4B.json +20 -0
  31. model_data_json/Qwen_Qwen3-8B.json +20 -0
  32. model_data_json/RedHatAI_Mistral-Small-24B-Instruct-2501-FP8-dynamic.json +25 -0
  33. model_data_json/Revai_reverb-diarization-v1.json +15 -0
  34. model_data_json/Rostlab_prot_t5_xl_uniref50.json +18 -0
  35. model_data_json/Snowflake_snowflake-arctic-embed-m-v2.0.json +0 -0
  36. model_data_json/THUDM_glm-4-voice-tokenizer.json +11 -0
  37. model_data_json/TaylorAI_gte-tiny.json +22 -0
  38. model_data_json/TheBloke_Llama-2-7B-Chat-GGUF.json +22 -0
  39. model_data_json/TheBloke_Mixtral-8x7B-Instruct-v0.1-AWQ.json +26 -0
  40. model_data_json/UBC-NLP_MARBERT.json +22 -0
  41. model_data_json/ZhenYe234_hubert_base_general_audio.json +13 -0
  42. model_data_json/aglazkova_bart_finetuned_keyphrase_extraction.json +22 -0
  43. model_data_json/aipicasso_emi.json +18 -0
  44. model_data_json/allegro_herbert-base-cased.json +19 -0
  45. model_data_json/answerdotai_ModernBERT-large.json +21 -0
  46. model_data_json/ashawkey_mvdream-sd2.1-diffusers.json +14 -0
  47. model_data_json/aubmindlab_bert-base-arabert.json +20 -0
  48. model_data_json/autogluon_chronos-bolt-tiny.json +21 -0
  49. model_data_json/bartowski_Llama-3.2-3B-Instruct-GGUF.json +28 -0
  50. model_data_json/benjamin-paine_stable-diffusion-v1-5-inpainting.json +21 -0
index.faiss CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:78a068ac98a5de614955c9c1e307b40f7b403bd46d315cf3b583f22466bf5e7a
- size 3545133
+ oid sha256:9c3f83a20b1e8774d976546f71114c9bd9195f65e2e101a24bf21a31b9945b0c
+ size 4187181
index_to_metadata.pkl CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe076a7a46265654075c23b846b259d7beaef163f5a5c72d14246a0e3d73579f
- size 530243
+ oid sha256:febfb797520f91f6872504c1bdcbe58d5446caf4da8e9bdcd38c99bab4882e26
+ size 689653
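
index.faiss and index_to_metadata.pkl are Git LFS pointers to a FAISS vector index and a pickled mapping from index positions to model metadata. Below is a minimal sketch of how such a pair is typically loaded and queried, assuming the pickle holds a dict keyed by FAISS row id; the actual consumer code is not part of this commit.

```python
import pickle

import faiss  # pip install faiss-cpu
import numpy as np

# Load the vector index and the metadata aligned with its row ids.
index = faiss.read_index("index.faiss")
with open("index_to_metadata.pkl", "rb") as f:
    index_to_metadata = pickle.load(f)  # assumed layout: {row_id: metadata dict}

def search(query_vector: np.ndarray, k: int = 5):
    """Return metadata for the k nearest stored vectors (illustrative only)."""
    query = query_vector.reshape(1, -1).astype("float32")
    distances, ids = index.search(query, k)
    return [index_to_metadata.get(int(i)) for i in ids[0]]
```
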
model_data_json/Alibaba-NLP_gte-modernbert-base.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "model_id": "Alibaba-NLP/gte-modernbert-base",
+ "downloads": 69419,
+ "tags": [
+ "transformers",
+ "pytorch",
+ "onnx",
+ "safetensors",
+ "modernbert",
+ "feature-extraction",
+ "sentence-transformers",
+ "mteb",
+ "embedding",
+ "transformers.js",
+ "sentence-similarity",
+ "en",
+ "arxiv:2308.03281",
+ "base_model:answerdotai/ModernBERT-base",
+ "base_model:finetune:answerdotai/ModernBERT-base",
+ "license:apache-2.0",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- license: apache-2.0 language: - en base_model: - answerdotai/ModernBERT-base base_model_relation: finetune pipeline_tag: sentence-similarity library_name: transformers tags: - sentence-transformers - mteb - embedding - transformers.js --- # gte-modernbert-base We are excited to introduce the series of models, which are built upon the latest modernBERT pre-trained encoder-only foundation models. The series models include both text embedding models and rerank models. The models demonstrates competitive performance in several text embedding and text retrieval evaluation tasks when compared to similar-scale models from the current open-source community. This includes assessments such as MTEB, LoCO, and COIR evaluation. ## Model Overview - Developed by: Tongyi Lab, Alibaba Group - Model Type: Text Embedding - Primary Language: English - Model Size: 149M - Max Input Length: 8192 tokens - Output Dimension: 768 ### Model list | Models | Language | Model Type | Model Size | Max Seq. Length | Dimension | MTEB-en | BEIR | LoCo | CoIR | |:--------------------------------------------------------------------------------------:|:--------:|:----------------------:|:----------:|:---------------:|:---------:|:-------:|:----:|:----:|:----:| | []( | English | text embedding | 149M | 8192 | 768 | 64.38 | 55.33 | 87.57 | 79.31 | | []( | English | text reranker | 149M | 8192 | - | - | 56.19 | 90.68 | 79.99 | ## Usage > [!TIP] > For and , if your GPU supports it, the efficient Flash Attention 2 will be used automatically if you have installed. It is not mandatory. > > Use with Use with : Use with : ## Training Details The series of models follows the training scheme of the previous GTE models, with the only difference being that the pre-training language model base has been replaced from GTE-MLM to ModernBert. For more training details, please refer to our paper: mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval ## Evaluation ### MTEB The results of other models are retrieved from MTEB leaderboard. Given that all models in the series have a size of less than 1B parameters, we focused exclusively on the results of models under 1B from the MTEB leaderboard. | Model Name | Param Size (M) | Dimension | Sequence Length | Average (56) | Class. (12) | Clust. (11) | Pair Class. (3) | Reran. (4) | Retr. (15) | STS (10) | Summ. 
(1) | |:------------------------------------------------------------------------------------------------:|:--------------:|:---------:|:---------------:|:------------:|:-----------:|:---:|:---:|:---:|:---:|:-----------:|:--------:| | mxbai-embed-large-v1 | 335 | 1024 | 512 | 64.68 | 75.64 | 46.71 | 87.2 | 60.11 | 54.39 | 85 | 32.71 | | multilingual-e5-large-instruct | 560 | 1024 | 514 | 64.41 | 77.56 | 47.1 | 86.19 | 58.58 | 52.47 | 84.78 | 30.39 | | bge-large-en-v1.5 | 335 | 1024 | 512 | 64.23 | 75.97 | 46.08 | 87.12 | 60.03 | 54.29 | 83.11 | 31.61 | | gte-base-en-v1.5 | 137 | 768 | 8192 | 64.11 | 77.17 | 46.82 | 85.33 | 57.66 | 54.09 | 81.97 | 31.17 | | bge-base-en-v1.5 | 109 | 768 | 512 | 63.55 | 75.53 | 45.77 | 86.55 | 58.86 | 53.25 | 82.4 | 31.07 | | gte-large-en-v1.5 | 409 | 1024 | 8192 | 65.39 | 77.75 | 47.95 | 84.63 | 58.50 | 57.91 | 81.43 | 30.91 | | modernbert-embed-base | 149 | 768 | 8192 | 62.62 | 74.31 | 44.98 | 83.96 | 56.42 | 52.89 | 81.78 | 31.39 | | nomic-embed-text-v1.5 | | 768 | 8192 | 62.28 | 73.55 | 43.93 | 84.61 | 55.78 | 53.01| 81.94 | 30.4 | | gte-multilingual-base | 305 | 768 | 8192 | 61.4 | 70.89 | 44.31 | 84.24 | 57.47 |51.08 | 82.11 | 30.58 | | jina-embeddings-v3 | 572 | 1024 | 8192 | 65.51 | 82.58 |45.21 |84.01 |58.13 |53.88 | 85.81 | 29.71 | | **gte-modernbert-base** | 149 | 768 | 8192 | **64.38** | **76.99** | **46.47** | **85.93** | **59.24** | **55.33** | **81.57** | **30.68** | ### LoCo (Long Document Retrieval)(NDCG@10) | Model Name | Dimension | Sequence Length | Average (5) | QsmsumRetrieval | SummScreenRetrieval | QasperAbastractRetrieval | QasperTitleRetrieval | GovReportRetrieval | |:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| | gte-qwen1.5-7b | 4096 | 32768 | 87.57 | 49.37 | 93.10 | 99.67 | 97.54 | 98.21 | | gte-large-v1.5 |1024 | 8192 | 86.71 | 44.55 | 92.61 | 99.82 | 97.81 | 98.74 | | gte-base-v1.5 | 768 | 8192 | 87.44 | 49.91 | 91.78 | 99.82 | 97.13 | 98.58 | | gte-modernbert-base | 768 | 8192 | 88.88 | 54.45 | 93.00 | 99.82 | 98.03 | 98.70 | | gte-reranker-modernbert-base | - | 8192 | 90.68 | 70.86 | 94.06 | 99.73 | 99.11 | 89.67 | ### COIR (Code Retrieval Task)(NDCG@10) | Model Name | Dimension | Sequence Length | Average(20) | CodeSearchNet-ccr-go | CodeSearchNet-ccr-java | CodeSearchNet-ccr-javascript | CodeSearchNet-ccr-php | CodeSearchNet-ccr-python | CodeSearchNet-ccr-ruby | CodeSearchNet-go | CodeSearchNet-java | CodeSearchNet-javascript | CodeSearchNet-php | CodeSearchNet-python | CodeSearchNet-ruby | apps | codefeedback-mt | codefeedback-st | codetrans-contest | codetrans-dl | cosqa | stackoverflow-qa | synthetic-text2sql | |:----:|:---:|:---:|:---:|:---:| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | gte-modernbert-base | 768 | 8192 | 79.31 | 94.15 | 93.57 | 94.27 | 91.51 | 93.93 | 90.63 | 88.32 | 83.27 | 76.05 | 85.12 | 88.16 | 77.59 | 57.54 | 82.34 | 85.95 | 71.89 | 35.46 | 43.47 | 91.2 | 61.87 | | gte-reranker-modernbert-base | - | 8192 | 79.99 | 96.43 | 96.88 | 98.32 | 91.81 | 97.7 | 91.96 | 88.81 | 79.71 | 76.27 | 89.39 | 98.37 | 84.11 | 47.57 | 83.37 | 88.91 | 49.66 | 36.36 | 44.37 | 89.58 | 64.21 | ### BEIR(NDCG@10) | Model Name | Dimension | Sequence Length | Average(15) | ArguAna | ClimateFEVER | CQADupstackAndroidRetrieval | DBPedia | FEVER | FiQA2018 | HotpotQA | MSMARCO | NFCorpus | NQ | QuoraRetrieval | SCIDOCS | SciFact | Touche2020 | TRECCOVID | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 
:---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | gte-modernbert-base | 768 | 8192 | 55.33 | 72.68 | 37.74 | 42.63 | 41.79 | 91.03 | 48.81 | 69.47 | 40.9 | 36.44 | 57.62 | 88.55 | 21.29 | 77.4 | 21.68 | 81.95 | | gte-reranker-modernbert-base | - | 8192 | 56.73 | 69.03 | 37.79 | 44.68 | 47.23 | 94.54 | 49.81 | 78.16 | 45.38 | 30.69 | 64.57 | 87.77 | 20.60 | 73.57 | 27.36 | 79.89 | ## Hiring We have open positions for **Research Interns** and **Full-Time Researchers** to join our team at Tongyi Lab. We are seeking passionate individuals with expertise in representation learning, LLM-driven information retrieval, Retrieval-Augmented Generation (RAG), and agent-based systems. Our team is located in the vibrant cities of **Beijing** and **Hangzhou**. If you are driven by curiosity and eager to make a meaningful impact through your work, we would love to hear from you. Please submit your resume along with a brief introduction to <a href=\"mailto:[email protected]\">[email protected]</a>. ## Citation If you find our paper or models helpful, feel free to give us a cite.",
+ "model_explanation_gemini": "Generates English text embeddings for sentence similarity and retrieval tasks, achieving competitive performance in MTEB, LoCO, and COIR evaluations compared to similar-scale models. \n\n**Features:** \n- **Model Type:** Text Embedding \n- **Base Model:** ModernBERT-base (finetuned) \n- **Language:** English \n- **Size:** 149M parameters \n- **Max Input Length:** 8192 tokens \n- **Output Dimension:** 768 \n- **Key Metrics:** MT"
+ }
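
The usage snippets referenced in the description above ("Use with …") were stripped during extraction. Here is a minimal sketch of the sentence-transformers path for this checkpoint, using only the model id and figures (768-dim output, 8192-token input) stated in the card; the call pattern is standard sentence-transformers, not code copied from the card.

```python
from sentence_transformers import SentenceTransformer

# gte-modernbert-base: 768-dimensional embeddings, up to 8192 input tokens (per the card above).
model = SentenceTransformer("Alibaba-NLP/gte-modernbert-base")

sentences = ["What is the capital of China?", "Beijing is the capital of China."]
embeddings = model.encode(sentences, normalize_embeddings=True)
print(embeddings.shape)                      # (2, 768)
print(float(embeddings[0] @ embeddings[1]))  # cosine similarity of the two sentences
```
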
model_data_json/BAAI_bge-multilingual-gemma2.json ADDED
The diff for this file is too large to render. See raw diff
 
model_data_json/BAAI_llm-embedder.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "model_id": "BAAI/llm-embedder",
+ "downloads": 82703,
+ "tags": [
+ "transformers",
+ "pytorch",
+ "safetensors",
+ "bert",
+ "feature-extraction",
+ "arxiv:2310.07554",
+ "arxiv:2309.07597",
+ "license:mit",
+ "text-embeddings-inference",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- license: mit --- <h1 align=\"center\">FlagEmbedding</h1> <h4 align=\"center\"> <p> <a href=#model-list>Model List</a> | <a href=#frequently-asked-questions>FAQ</a> | <a href=#usage>Usage</a> | <a href=\"#evaluation\">Evaluation</a> | <a href=\"#train\">Train</a> | <a href=\"#contact\">Contact</a> | <a href=\"#citation\">Citation</a> | <a href=\"#license\">License</a> <p> </h4> More details please refer to our Github: FlagEmbedding. English | 中文 <span style=\"#FF69B4;\"> **Hiring:** We're seeking experienced NLP researchers and intern students focusing on dense retrieval and retrieval-augmented LLMs. If you're interested, please feel free to reach out to us via email at [email protected].</span> FlagEmbedding can map any text to a low-dimensional dense vector, which can be used for tasks like retrieval, classification, clustering, and semantic search. And it can also be used in vector databases for LLMs. ************* 🌟**Updates**🌟 ************* - 10/12/2023: Release LLM-Embedder, a unified embedding model to support diverse retrieval augmentation needs for LLMs. Paper :fire: - 09/15/2023: The technical report of BGE has been released - 09/15/2023: The massive training data of BGE has been released - 09/12/2023: New models: - **New reranker model**: release cross-encoder models and , which are more powerful than embedding model. We recommend to use/fine-tune them to re-rank top-k documents returned by embedding models. - **update embedding model**: release embedding model to alleviate the issue of the similarity distribution, and enhance its retrieval ability without instruction. <details> <summary>More</summary> <!-- ### More --> - 09/07/2023: Update fine-tune code: Add script to mine hard negatives and support adding instruction during fine-tuning. - 08/09/2023: BGE Models are integrated into **Langchain**, you can use it like this; C-MTEB **leaderboard** is available. - 08/05/2023: Release base-scale and small-scale models, **best performance among the models of the same size 🤗** - 08/02/2023: Release (short for BAAI General Embedding) Models, **rank 1st on MTEB and C-MTEB benchmark!** :tada: :tada: - 08/01/2023: We release the Chinese Massive Text Embedding Benchmark (**C-MTEB**), consisting of 31 test dataset. </details> ## Model List is short for . 
| Model | Language | | Description | query instruction for retrieval [1] | |:-------------------------------|:--------:| :--------:| :--------:|:--------:| | BAAI/llm-embedder | English | Inference Fine-tune | a unified embedding model to support diverse retrieval augmentation needs for LLMs | See README | | BAAI/bge-reranker-large | Chinese and English | Inference Fine-tune | a cross-encoder model which is more accurate but less efficient [2] | | | BAAI/bge-reranker-base | Chinese and English | Inference Fine-tune | a cross-encoder model which is more accurate but less efficient [2] | | | BAAI/bge-large-en-v1.5 | English | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-base-en-v1.5 | English | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-small-en-v1.5 | English | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-large-zh-v1.5 | Chinese | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-base-zh-v1.5 | Chinese | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-small-zh-v1.5 | Chinese | Inference Fine-tune | version 1.5 with more reasonable similarity distribution | | | BAAI/bge-large-en | English | Inference Fine-tune | :trophy: rank **1st** in MTEB leaderboard | | | BAAI/bge-base-en | English | Inference Fine-tune | a base-scale model but with similar ability to | | | BAAI/bge-small-en | English | Inference Fine-tune |a small-scale model but with competitive performance | | | BAAI/bge-large-zh | Chinese | Inference Fine-tune | :trophy: rank **1st** in C-MTEB benchmark | | | BAAI/bge-base-zh | Chinese | Inference Fine-tune | a base-scale model but with similar ability to | | | BAAI/bge-small-zh | Chinese | Inference Fine-tune | a small-scale model but with competitive performance | | [1\\]: If you need to search the relevant passages in a query, we suggest to add the instruction to the query; in other cases, no instruction is needed, just use the original query directly. In all cases, **no instruction** needs to be added to passages. [2\\]: Different from the embedding model, reranker uses question and document as input and directly output similarity instead of embedding. To balance the accuracy and time cost, cross-encoder is widely used to re-rank top-k documents retrieved by other simple models. For example, use bge embedding model to retrieve top 100 relevant documents, and then use bge reranker to re-rank the top 100 documents to get the final top-3 results. All models have been uploaded to Huggingface Hub, and you can see them at If you cannot open the Huggingface Hub, you can also download the models at . ## Frequently asked questions **1. How to fine-tune bge embedding model?** Following this example to prepare data and fine-tune your model. Some suggestions: - Mine hard negatives following this example, which can improve the retrieval performance. - In general, larger hyper-parameter brings better performance. You can expand it by enabling , (df_config.json can refer to ds_config.json, , etc. - If you pre-train bge on your data, the pre-trained model cannot be directly used to calculate similarity, and it must be fine-tuned with contrastive learning before computing similarity. - If the accuracy of the fine-tuned model is still not high, it is recommended to use/fine-tune the cross-encoder model (bge-reranker) to re-rank top-k results. 
Hard negatives also are needed to fine-tune reranker. <details> <summary>2. The similarity score between two dissimilar sentences is higher than 0.5</summary> <!-- ### The similarity score between two dissimilar sentences is higher than 0.5 --> **Suggest to use bge v1.5, which alleviates the issue of the similarity distribution.** Since we finetune the models by contrastive learning with a temperature of 0.01, the similarity distribution of the current BGE model is about in the interval \\[0.6, 1\\]. So a similarity score greater than 0.5 does not indicate that the two sentences are similar. For downstream tasks, such as passage retrieval or semantic similarity, **what matters is the relative order of the scores, not the absolute value.** If you need to filter similar sentences based on a similarity threshold, please select an appropriate similarity threshold based on the similarity distribution on your data (such as 0.8, 0.85, or even 0.9). </details> <details> <summary>3. When does the query instruction need to be used</summary> <!-- ### When does the query instruction need to be used --> For the , we improve its retrieval ability when not using instruction. No instruction only has a slight degradation in retrieval performance compared with using instruction. So you can generate embedding without instruction in all cases for convenience. For a retrieval task that uses short queries to find long related documents, it is recommended to add instructions for these short queries. **The best method to decide whether to add instructions for queries is choosing the setting that achieves better performance on your task.** In all cases, the documents/passages do not need to add the instruction. </details> ## Usage ### Usage for Embedding Model Here are some examples of using models with FlagEmbedding, Sentence-Transformers, Langchain, or Huggingface Transformers. #### Using FlagEmbedding If it doesn't work for you, you can see FlagEmbedding for more methods to install FlagEmbedding. For the value of the argument , see Model List. By default, FlagModel will use all available GPUs when encoding. Please set to select specific GPUs. You also can set to make all GPUs unavailable. #### Using Sentence-Transformers You can also use the models with sentence-transformers: For s2p(short query to long passage) retrieval task, each short query should start with an instruction (instructions see Model List). But the instruction is not needed for passages. #### Using Langchain You can use in langchain like this: #### Using HuggingFace Transformers With the transformers package, you can use the model like this: First, you pass your input through the transformer model, then you select the last hidden state of the first token (i.e., [CLS]) as the sentence embedding. ### Usage for Reranker Different from embedding model, reranker uses question and document as input and directly output similarity instead of embedding. You can get a relevance score by inputting query and passage to the reranker. The reranker is optimized based cross-entropy loss, so the relevance score is not bounded to a specific range. #### Using FlagEmbedding Get relevance scores (higher scores indicate more relevance): #### Using Huggingface transformers ## Evaluation models achieve **state-of-the-art performance on both MTEB and C-MTEB leaderboard!** For more details and evaluation tools see our scripts. 
- **MTEB**: | Model Name | Dimension | Sequence Length | Average (56) | Retrieval (15) |Clustering (11) | Pair Classification (3) | Reranking (4) | STS (10) | Summarization (1) | Classification (12) | |:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| | BAAI/bge-large-en-v1.5 | 1024 | 512 | **64.23** | **54.29** | 46.08 | 87.12 | 60.03 | 83.11 | 31.61 | 75.97 | | BAAI/bge-base-en-v1.5 | 768 | 512 | 63.55 | 53.25 | 45.77 | 86.55 | 58.86 | 82.4 | 31.07 | 75.53 | | BAAI/bge-small-en-v1.5 | 384 | 512 | 62.17 |51.68 | 43.82 | 84.92 | 58.36 | 81.59 | 30.12 | 74.14 | | bge-large-en | 1024 | 512 | 63.98 | 53.9 | 46.98 | 85.8 | 59.48 | 81.56 | 32.06 | 76.21 | | bge-base-en | 768 | 512 | 63.36 | 53.0 | 46.32 | 85.86 | 58.7 | 81.84 | 29.27 | 75.27 | | gte-large | 1024 | 512 | 63.13 | 52.22 | 46.84 | 85.00 | 59.13 | 83.35 | 31.66 | 73.33 | | gte-base | 768 | 512 | 62.39 | 51.14 | 46.2 | 84.57 | 58.61 | 82.3 | 31.17 | 73.01 | | e5-large-v2 | 1024| 512 | 62.25 | 50.56 | 44.49 | 86.03 | 56.61 | 82.05 | 30.19 | 75.24 | | bge-small-en | 384 | 512 | 62.11 | 51.82 | 44.31 | 83.78 | 57.97 | 80.72 | 30.53 | 74.37 | | instructor-xl | 768 | 512 | 61.79 | 49.26 | 44.74 | 86.62 | 57.29 | 83.06 | 32.32 | 61.79 | | e5-base-v2 | 768 | 512 | 61.5 | 50.29 | 43.80 | 85.73 | 55.91 | 81.05 | 30.28 | 73.84 | | gte-small | 384 | 512 | 61.36 | 49.46 | 44.89 | 83.54 | 57.7 | 82.07 | 30.42 | 72.31 | | text-embedding-ada-002 | 1536 | 8192 | 60.99 | 49.25 | 45.9 | 84.89 | 56.32 | 80.97 | 30.8 | 70.93 | | e5-small-v2 | 384 | 512 | 59.93 | 49.04 | 39.92 | 84.67 | 54.32 | 80.39 | 31.16 | 72.94 | | sentence-t5-xxl | 768 | 512 | 59.51 | 42.24 | 43.72 | 85.06 | 56.42 | 82.63 | 30.08 | 73.42 | | all-mpnet-base-v2 | 768 | 514 | 57.78 | 43.81 | 43.69 | 83.04 | 59.36 | 80.28 | 27.49 | 65.07 | | sgpt-bloom-7b1-msmarco | 4096 | 2048 | 57.59 | 48.22 | 38.93 | 81.9 | 55.65 | 77.74 | 33.6 | 66.19 | - **C-MTEB**: We create the benchmark C-MTEB for Chinese text embedding which consists of 31 datasets from 6 tasks. Please refer to C_MTEB for a detailed introduction. 
| Model | Embedding dimension | Avg | Retrieval | STS | PairClassification | Classification | Reranking | Clustering | |:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| | **BAAI/bge-large-zh-v1.5** | 1024 | **64.53** | 70.46 | 56.25 | 81.6 | 69.13 | 65.84 | 48.99 | | BAAI/bge-base-zh-v1.5 | 768 | 63.13 | 69.49 | 53.72 | 79.75 | 68.07 | 65.39 | 47.53 | | BAAI/bge-small-zh-v1.5 | 512 | 57.82 | 61.77 | 49.11 | 70.41 | 63.96 | 60.92 | 44.18 | | BAAI/bge-large-zh | 1024 | 64.20 | 71.53 | 54.98 | 78.94 | 68.32 | 65.11 | 48.39 | | bge-large-zh-noinstruct | 1024 | 63.53 | 70.55 | 53 | 76.77 | 68.58 | 64.91 | 50.01 | | BAAI/bge-base-zh | 768 | 62.96 | 69.53 | 54.12 | 77.5 | 67.07 | 64.91 | 47.63 | | multilingual-e5-large | 1024 | 58.79 | 63.66 | 48.44 | 69.89 | 67.34 | 56.00 | 48.23 | | BAAI/bge-small-zh | 512 | 58.27 | 63.07 | 49.45 | 70.35 | 63.64 | 61.48 | 45.09 | | m3e-base | 768 | 57.10 | 56.91 | 50.47 | 63.99 | 67.52 | 59.34 | 47.68 | | m3e-large | 1024 | 57.05 | 54.75 | 50.42 | 64.3 | 68.2 | 59.66 | 48.88 | | multilingual-e5-base | 768 | 55.48 | 61.63 | 46.49 | 67.07 | 65.35 | 54.35 | 40.68 | | multilingual-e5-small | 384 | 55.38 | 59.95 | 45.27 | 66.45 | 65.85 | 53.86 | 45.26 | | text-embedding-ada-002(OpenAI) | 1536 | 53.02 | 52.0 | 43.35 | 69.56 | 64.31 | 54.28 | 45.68 | | luotuo | 1024 | 49.37 | 44.4 | 42.78 | 66.62 | 61 | 49.25 | 44.39 | | text2vec-base | 768 | 47.63 | 38.79 | 43.41 | 67.41 | 62.19 | 49.45 | 37.66 | | text2vec-large | 1024 | 47.36 | 41.94 | 44.97 | 70.86 | 60.66 | 49.16 | 30.02 | - **Reranking**: See C_MTEB for evaluation script. | Model | T2Reranking | T2RerankingZh2En\\* | T2RerankingEn2Zh\\* | MMarcoReranking | CMedQAv1 | CMedQAv2 | Avg | |:-------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| | text2vec-base-multilingual | 64.66 | 62.94 | 62.51 | 14.37 | 48.46 | 48.6 | 50.26 | | multilingual-e5-small | 65.62 | 60.94 | 56.41 | 29.91 | 67.26 | 66.54 | 57.78 | | multilingual-e5-large | 64.55 | 61.61 | 54.28 | 28.6 | 67.42 | 67.92 | 57.4 | | multilingual-e5-base | 64.21 | 62.13 | 54.68 | 29.5 | 66.23 | 66.98 | 57.29 | | m3e-base | 66.03 | 62.74 | 56.07 | 17.51 | 77.05 | 76.76 | 59.36 | | m3e-large | 66.13 | 62.72 | 56.1 | 16.46 | 77.76 | 78.27 | 59.57 | | bge-base-zh-v1.5 | 66.49 | 63.25 | 57.02 | 29.74 | 80.47 | 84.88 | 63.64 | | bge-large-zh-v1.5 | 65.74 | 63.39 | 57.03 | 28.74 | 83.45 | 85.44 | 63.97 | | BAAI/bge-reranker-base | 67.28 | 63.95 | 60.45 | 35.46 | 81.26 | 84.1 | 65.42 | | BAAI/bge-reranker-large | 67.6 | 64.03 | 61.44 | 37.16 | 82.15 | 84.18 | 66.09 | \\* : T2RerankingZh2En and T2RerankingEn2Zh are cross-language retrieval tasks ## Train ### BAAI Embedding We pre-train the models using retromae and train them on large-scale pair data using contrastive learning. **You can fine-tune the embedding model on your data following our examples.** We also provide a pre-train example. Note that the goal of pre-training is to reconstruct the text, and the pre-trained model cannot be used for similarity calculation directly, it needs to be fine-tuned. For more training details for bge see baai_general_embedding. ### BGE Reranker Cross-encoder will perform full-attention over the input pair, which is more accurate than embedding model (i.e., bi-encoder) but more time-consuming than embedding model. Therefore, it can be used to re-rank the top-k documents returned by embedding model. 
We train the cross-encoder on a multilingual pair data, The data format is the same as embedding model, so you can fine-tune it easily following our example. For more details please refer to ./FlagEmbedding/reranker/README.md ### Our Contributors: <a href=\" <img src=\" /> </a> ## Contact If you have any question or suggestion related to this project, feel free to open an issue or pull request. You also can email Shitao Xiao([email protected]) and Zheng Liu([email protected]). ## Citation If you find this repository useful, please consider giving a star :star: and citation ## License FlagEmbedding is licensed under the MIT License. The released models can be used for commercial purposes free of charge.",
+ "model_explanation_gemini": "\"BAAI_llm-embedder is a unified embedding model that maps text to dense vectors for retrieval-augmented tasks in LLMs, supporting retrieval, classification, clustering, and semantic search.\"\n\nFeatures: \n- Generates low-dimensional dense vectors from text \n- Designed for retrieval augmentation in LLMs \n- Supports tasks like retrieval, classification, clustering, and semantic search \n- Compatible with vector databases for LLMs \n\nComparison: \nThe model is part of the FlagEmbedding"
+ }
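
The description above outlines Hugging Face Transformers usage with CLS pooling ("select the last hidden state of the first token") but the code block itself was lost. A minimal sketch of that approach follows; note the card also recommends prepending a task-specific instruction to queries, which is omitted here.

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/llm-embedder")
model = AutoModel.from_pretrained("BAAI/llm-embedder")

texts = ["What is dense retrieval?", "Dense retrieval encodes queries and passages as vectors."]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# CLS pooling as described in the card: last hidden state of the first token.
embeddings = outputs.last_hidden_state[:, 0]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
print(embeddings @ embeddings.T)  # pairwise cosine similarities
```
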
model_data_json/EleutherAI_pythia-1b.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "model_id": "EleutherAI/pythia-1b",
+ "downloads": 83131,
+ "tags": [
+ "transformers",
+ "pytorch",
+ "safetensors",
+ "gpt_neox",
+ "text-generation",
+ "causal-lm",
+ "pythia",
+ "en",
+ "dataset:the_pile",
+ "arxiv:2304.01373",
+ "arxiv:2101.00027",
+ "arxiv:2201.07311",
+ "license:apache-2.0",
+ "autotrain_compatible",
+ "text-generation-inference",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- language: - en tags: - pytorch - causal-lm - pythia license: apache-2.0 datasets: - the_pile --- The *Pythia Scaling Suite* is a collection of models developed to facilitate interpretability research (see paper). It contains two sets of eight models of sizes 70M, 160M, 410M, 1B, 1.4B, 2.8B, 6.9B, and 12B. For each size, there are two models: one trained on the Pile, and one trained on the Pile after the dataset has been globally deduplicated. All 8 model sizes are trained on the exact same data, in the exact same order. We also provide 154 intermediate checkpoints per model, hosted on Hugging Face as branches. The Pythia model suite was deliberately designed to promote scientific research on large language models, especially interpretability research. Despite not centering downstream performance as a design goal, we find the models <a href=\"#evaluations\">match or exceed</a> the performance of similar and same-sized models, such as those in the OPT and GPT-Neo suites. <details> <summary style=\"font-weight:600\">Details on previous early release and naming convention.</summary> Previously, we released an early version of the Pythia suite to the public. However, we decided to retrain the model suite to address a few hyperparameter discrepancies. This model card <a href=\"#changelog\">lists the changes</a>; see appendix B in the Pythia paper for further discussion. We found no difference in benchmark performance between the two Pythia versions. The old models are still available, but we suggest the retrained suite if you are just starting to use Pythia.<br> **This is the current release.** Please note that all models in the *Pythia* suite were renamed in January 2023. For clarity, a <a href=\"#naming-convention-and-parameter-count\">table comparing the old and new names</a> is provided in this model card, together with exact parameter counts. </details> <br> # Pythia-1B ## Model Details - Developed by: EleutherAI - Model type: Transformer-based Language Model - Language: English - Learn more: Pythia's GitHub repository for training procedure, config files, and details on how to use. See paper for more evals and implementation details. - Library: GPT-NeoX - License: Apache 2.0 - Contact: to ask questions about this model, join the EleutherAI Discord, and post them in . Please read the existing *Pythia* documentation before asking about it in the EleutherAI Discord. For general correspondence: contact@eleuther. ai. <figure> | Pythia model | Non-Embedding Params | Layers | Model Dim | Heads | Batch Size | Learning Rate | Equivalent Models | | -----------: | -------------------: | :----: | :-------: | :---: | :--------: | :-------------------: | :--------------------: | | 70M | 18,915,328 | 6 | 512 | 8 | 2M | 1.0 x 10<sup>-3</sup> | — | | 160M | 85,056,000 | 12 | 768 | 12 | 2M | 6.0 x 10<sup>-4</sup> | GPT-Neo 125M, OPT-125M | | 410M | 302,311,424 | 24 | 1024 | 16 | 2M | 3.0 x 10<sup>-4</sup> | OPT-350M | | 1.0B | 805,736,448 | 16 | 2048 | 8 | 2M | 3.0 x 10<sup>-4</sup> | — | | 1.4B | 1,208,602,624 | 24 | 2048 | 16 | 2M | 2.0 x 10<sup>-4</sup> | GPT-Neo 1.3B, OPT-1.3B | | 2.8B | 2,517,652,480 | 32 | 2560 | 32 | 2M | 1.6 x 10<sup>-4</sup> | GPT-Neo 2.7B, OPT-2.7B | | 6.9B | 6,444,163,072 | 32 | 4096 | 32 | 2M | 1.2 x 10<sup>-4</sup> | OPT-6.7B | | 12B | 11,327,027,200 | 36 | 5120 | 40 | 2M | 1.2 x 10<sup>-4</sup> | — | <figcaption>Engineering details for the <i>Pythia Suite</i>. Deduped and non-deduped models of a given size have the same hyperparameters. 
“Equivalent” models have <b>exactly</b> the same architecture, and the same number of non-embedding parameters.</figcaption> </figure> ## Uses and Limitations ### Intended Use The primary intended use of Pythia is research on the behavior, functionality, and limitations of large language models. This suite is intended to provide a controlled setting for performing scientific experiments. We also provide 154 checkpoints per model: initial , 10 log-spaced checkpoints , and 143 evenly-spaced checkpoints from to . These checkpoints are hosted on Hugging Face as branches. Note that branch corresponds exactly to the model checkpoint on the branch of each model. You may also further fine-tune and adapt Pythia-1B for deployment, as long as your use is in accordance with the Apache 2.0 license. Pythia models work with the Hugging Face Transformers Library. If you decide to use pre-trained Pythia-1B as a basis for your fine-tuned model, please conduct your own risk and bias assessment. ### Out-of-scope use The Pythia Suite is **not** intended for deployment. It is not a in itself a product and cannot be used for human-facing interactions. For example, the model may generate harmful or offensive text. Please evaluate the risks associated with your particular use case. Pythia models are English-language only, and are not suitable for translation or generating text in other languages. Pythia-1B has not been fine-tuned for downstream contexts in which language models are commonly deployed, such as writing genre prose, or commercial chatbots. This means Pythia-1B will **not** respond to a given prompt the way a product like ChatGPT does. This is because, unlike this model, ChatGPT was fine-tuned using methods such as Reinforcement Learning from Human Feedback (RLHF) to better “follow” human instructions. ### Limitations and biases The core functionality of a large language model is to take a string of text and predict the next token. The token used by the model need not produce the most “accurate” text. Never rely on Pythia-1B to produce factually accurate output. This model was trained on the Pile, a dataset known to contain profanity and texts that are lewd or otherwise offensive. See Section 6 of the Pile paper for a discussion of documented biases with regards to gender, religion, and race. Pythia-1B may produce socially unacceptable or undesirable text, *even if* the prompt itself does not include anything explicitly offensive. If you plan on using text generated through, for example, the Hosted Inference API, we recommend having a human curate the outputs of this language model before presenting it to other people. Please inform your audience that the text was generated by Pythia-1B. ### Quickstart Pythia models can be loaded and used via the following code, demonstrated here for the third checkpoint: Revision/branch corresponds exactly to the model checkpoint on the branch of each model.<br> For more information on how to use all Pythia models, see documentation on GitHub. ## Training ### Training data The Pile is a 825GiB general-purpose dataset in English. It was created by EleutherAI specifically for training large language models. It contains texts from 22 diverse sources, roughly broken down into five categories: academic writing (e.g. arXiv), internet (e.g. CommonCrawl), prose (e.g. Project Gutenberg), dialogue (e.g. YouTube subtitles), and miscellaneous (e.g. GitHub, Enron Emails). See the Pile paper for a breakdown of all data sources, methodology, and a discussion of ethical implications. 
Consult the datasheet for more detailed documentation about the Pile and its component datasets. The Pile can be downloaded from the official website, or from a community mirror.<br> The Pile was **not** deduplicated before being used to train Pythia-1B. ### Training procedure All models were trained on the exact same data, in the exact same order. Each model saw 299,892,736,000 tokens during training, and 143 checkpoints for each model are saved every 2,097,152,000 tokens, spaced evenly throughout training, from to (which is the same as ). In addition, we also provide frequent early checkpoints: and . This corresponds to training for just under 1 epoch on the Pile for non-deduplicated models, and about 1.5 epochs on the deduplicated Pile. All *Pythia* models trained for 143000 steps at a batch size of 2M (2,097,152 tokens).<br> See GitHub for more details on training procedure, including how to reproduce it.<br> Pythia uses the same tokenizer as GPT-NeoX- 20B. ## Evaluations All 16 *Pythia* models were evaluated using the LM Evaluation Harness. You can access the results by model and step at in the GitHub repository.<br> Expand the sections below to see plots of evaluation results for all Pythia and Pythia-deduped models compared with OPT and BLOOM. <details> <summary>LAMBADA – OpenAI</summary> <img src=\"/EleutherAI/pythia-12b/resolve/main/eval_plots/lambada_openai_v1.png\" style=\"width:auto\"/> </details> <details> <summary>Physical Interaction: Question Answering (PIQA)</summary> <img src=\"/EleutherAI/pythia-12b/resolve/main/eval_plots/piqa_v1.png\" style=\"width:auto\"/> </details> <details> <summary>WinoGrande</summary> <img src=\"/EleutherAI/pythia-12b/resolve/main/eval_plots/winogrande_v1.png\" style=\"width:auto\"/> </details> <details> <summary>AI2 Reasoning Challenge—Easy Set</summary> <img src=\"/EleutherAI/pythia-12b/resolve/main/eval_plots/arc_easy_v1.png\" style=\"width:auto\"/> </details> <details> <summary>SciQ</summary> <img src=\"/EleutherAI/pythia-12b/resolve/main/eval_plots/sciq_v1.png\" style=\"width:auto\"/> </details> ## Changelog This section compares differences between previously released Pythia v0 and the current models. See Appendix B of the Pythia paper for further discussion of these changes and the motivation behind them. We found that retraining Pythia had no impact on benchmark performance. - All model sizes are now trained with uniform batch size of 2M tokens. Previously, the models of size 160M, 410M, and 1.4B parameters were trained with batch sizes of 4M tokens. - We added checkpoints at initialization (step 0) and steps {1,2,4,8,16,32,64, 128,256,512} in addition to every 1000 training steps. - Flash Attention was used in the new retrained suite. - We remedied a minor inconsistency that existed in the original suite: all models of size 2.8B parameters or smaller had a learning rate (LR) schedule which decayed to a minimum LR of 10% the starting LR rate, but the 6.9B and 12B models all used an LR schedule which decayed to a minimum LR of 0. In the redone training runs, we rectified this inconsistency: all models now were trained with LR decaying to a minimum of 0.1× their maximum LR. ### Naming convention and parameter count *Pythia* models were renamed in January 2023. It is possible that the old naming convention still persists in some documentation by accident. The current naming convention (70M, 160M, etc.) is based on total parameter count. 
<figure style=\"width:32em\"> | current Pythia suffix | old suffix | total params | non-embedding params | | --------------------: | ---------: | -------------: | -------------------: | | 70M | 19M | 70,426,624 | 18,915,328 | | 160M | 125M | 162,322,944 | 85,056,000 | | 410M | 350M | 405,334,016 | 302,311,424 | | 1B | 800M | 1,011,781,632 | 805,736,448 | | 1.4B | 1.3B | 1,414,647,808 | 1,208,602,624 | | 2.8B | 2.7B | 2,775,208,960 | 2,517,652,480 | | 6.9B | 6.7B | 6,857,302,016 | 6,444,163,072 | | 12B | 13B | 11,846,072,320 | 11,327,027,200 | </figure>",
+ "model_explanation_gemini": "A 1-billion-parameter English language model designed for interpretability research, trained on the Pile dataset with reproducible checkpoints, matching or exceeding similar-sized models like OPT and GPT-Neo.\n\n**Features:** \n- Transformer-based causal language model \n- Part of the Pythia Scaling Suite (1B parameters) \n- Trained on the Pile dataset (with deduplicated variant available) \n- 154 intermediate checkpoints provided \n- Matches/exceeds performance of comparable models"
+ }
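
The Pythia quickstart referenced above ("demonstrated here for the third checkpoint") lost its code. Here is a sketch under the card's stated conventions; the step3000 revision is an assumption about which checkpoint branch was meant, and revision can be dropped to load the final model.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint branches follow the step-based naming described above; "step3000" is illustrative.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-1b", revision="step3000")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b", revision="step3000")

inputs = tokenizer("Hello, I am", return_tensors="pt")
tokens = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(tokens[0]))
```
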
model_data_json/FacebookAI_xlm-roberta-large-finetuned-conll03-english.json ADDED
@@ -0,0 +1,115 @@
+ {
+ "model_id": "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+ "downloads": 79650,
+ "tags": [
+ "transformers",
+ "pytorch",
+ "rust",
+ "onnx",
+ "safetensors",
+ "xlm-roberta",
+ "token-classification",
+ "multilingual",
+ "af",
+ "am",
+ "ar",
+ "as",
+ "az",
+ "be",
+ "bg",
+ "bn",
+ "br",
+ "bs",
+ "ca",
+ "cs",
+ "cy",
+ "da",
+ "de",
+ "el",
+ "en",
+ "eo",
+ "es",
+ "et",
+ "eu",
+ "fa",
+ "fi",
+ "fr",
+ "fy",
+ "ga",
+ "gd",
+ "gl",
+ "gu",
+ "ha",
+ "he",
+ "hi",
+ "hr",
+ "hu",
+ "hy",
+ "id",
+ "is",
+ "it",
+ "ja",
+ "jv",
+ "ka",
+ "kk",
+ "km",
+ "kn",
+ "ko",
+ "ku",
+ "ky",
+ "la",
+ "lo",
+ "lt",
+ "lv",
+ "mg",
+ "mk",
+ "ml",
+ "mn",
+ "mr",
+ "ms",
+ "my",
+ "ne",
+ "nl",
+ "no",
+ "om",
+ "or",
+ "pa",
+ "pl",
+ "ps",
+ "pt",
+ "ro",
+ "ru",
+ "sa",
+ "sd",
+ "si",
+ "sk",
+ "sl",
+ "so",
+ "sq",
+ "sr",
+ "su",
+ "sv",
+ "sw",
+ "ta",
+ "te",
+ "th",
+ "tl",
+ "tr",
+ "ug",
+ "uk",
+ "ur",
+ "uz",
+ "vi",
+ "xh",
+ "yi",
+ "zh",
+ "arxiv:1911.02116",
+ "arxiv:2008.03415",
+ "arxiv:1910.09700",
+ "autotrain_compatible",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- language: - multilingual - af - am - ar - as - az - be - bg - bn - br - bs - ca - cs - cy - da - de - el - en - eo - es - et - eu - fa - fi - fr - fy - ga - gd - gl - gu - ha - he - hi - hr - hu - hy - id - is - it - ja - jv - ka - kk - km - kn - ko - ku - ky - la - lo - lt - lv - mg - mk - ml - mn - mr - ms - my - ne - nl - no - om - or - pa - pl - ps - pt - ro - ru - sa - sd - si - sk - sl - so - sq - sr - su - sv - sw - ta - te - th - tl - tr - ug - uk - ur - uz - vi - xh - yi - zh --- # xlm-roberta-large-finetuned-conll03-english # Table of Contents 1. Model Details 2. Uses 3. Bias, Risks, and Limitations 4. Training 5. Evaluation 6. Environmental Impact 7. Technical Specifications 8. Citation 9. Model Card Authors 10. How To Get Started With the Model # Model Details ## Model Description The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. This model is XLM-RoBERTa-large fine-tuned with the conll2003 dataset in English. - **Developed by:** See associated paper - **Model type:** Multi-lingual language model - **Language(s) (NLP) or Countries (images):** XLM-RoBERTa is a multilingual model trained on 100 different languages; see GitHub Repo for full list; model is fine-tuned on a dataset in English - **License:** More information needed - **Related Models:** RoBERTa, XLM - **Parent Model:** XLM-RoBERTa-large - **Resources for more information:** -GitHub Repo -Associated Paper # Uses ## Direct Use The model is a language model. The model can be used for token classification, a natural language understanding task in which a label is assigned to some tokens in a text. ## Downstream Use Potential downstream use cases include Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging. To learn more about token classification and other potential downstream use cases, see the Hugging Face token classification docs. ## Out-of-Scope Use The model should not be used to intentionally create hostile or alienating environments for people. # Bias, Risks, and Limitations **CONTENT WARNING: Readers should be made aware that language generated by this model may be disturbing or offensive to some and may propagate historical and current stereotypes.** Significant research has explored bias and fairness issues with language models (see, e.g., Sheng et al. (2021) and Bender et al. (2021)). In the context of tasks relevant to this model, Mishra et al. (2020) explore social biases in NER systems for English and find that there is systematic bias in existing NER systems in that they fail to identify named entities from different demographic groups (though this paper did not look at BERT). For example, using a sample sentence from Mishra et al. (2020): ## Recommendations Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. # Training See the following resources for training data and training procedure details: - XLM-RoBERTa-large model card - CoNLL-2003 data card - Associated paper # Evaluation See the associated paper for evaluation details. # Environmental Impact Carbon emissions can be estimated using the Machine Learning Impact calculator presented in Lacoste et al. 
(2019). - **Hardware Type:** 500 32GB Nvidia V100 GPUs (from the associated paper) - **Hours used:** More information needed - **Cloud Provider:** More information needed - **Compute Region:** More information needed - **Carbon Emitted:** More information needed # Technical Specifications See the associated paper for further details. # Citation **BibTeX:** **APA:** - Conneau, A., Khandelwal, K., Goyal, N., Chaudhary, V., Wenzek, G., Guzmán, F., ... & Stoyanov, V. (2019). Unsupervised cross-lingual representation learning at scale. arXiv preprint arXiv:1911.02116. # Model Card Authors This model card was written by the team at Hugging Face. # How to Get Started with the Model Use the code below to get started with the model. You can use this model directly within a pipeline for NER. <details> <summary> Click to expand </summary> </details>",
+ "model_explanation_gemini": "\"Finetuned for English token classification tasks like Named Entity Recognition (NER) using the CoNLL-2003 dataset, based on a multilingual XLM-RoBERTa-large model.\"\n\nFeatures: \n- Multilingual base (XLM-RoBERTa-large) fine-tuned specifically for English \n- Task: Token classification (e.g., NER, POS tagging) \n- Trained on CoNLL-2003 dataset \n\nComparison: \nUnlike generic multilingual models, this variant"
+ }
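
The description's "How to Get Started" section says the model can be used directly within a pipeline for NER, but the snippet is missing. A minimal sketch of that pipeline usage follows (the example sentence is illustrative).

```python
from transformers import pipeline

# Token-classification pipeline; aggregation_strategy groups sub-word pieces into entities.
ner = pipeline(
    "token-classification",
    model="FacebookAI/xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple",
)
print(ner("Hello, I'm Omar and I live in Zürich."))
```
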
model_data_json/Helsinki-NLP_opus-mt-en-da.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "model_id": "Helsinki-NLP/opus-mt-en-da",
+ "downloads": 81331,
+ "tags": [
+ "transformers",
+ "pytorch",
+ "tf",
+ "marian",
+ "text2text-generation",
+ "translation",
+ "en",
+ "da",
+ "license:apache-2.0",
+ "autotrain_compatible",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- tags: - translation license: apache-2.0 --- ### opus-mt-en-da * source languages: en * target languages: da * OPUS readme: en-da * dataset: opus * model: transformer-align * pre-processing: normalization + SentencePiece * download original weights: opus-2019-12-18.zip * test set translations: opus-2019-12-18.test.txt * test set scores: opus-2019-12-18.eval.txt ## Benchmarks | testset | BLEU | chr-F | |-----------------------|-------|-------| | Tatoeba.en.da | 60.4 | 0.745 |",
+ "model_explanation_gemini": "Translates English (en) to Danish (da) using a transformer-align model with SentencePiece pre-processing. \n\nFeatures: \n- Source language: English (en) \n- Target language: Danish (da) \n- Architecture: transformer-align \n- Pre-processing: normalization + SentencePiece \n- Training dataset: OPUS \n- Benchmarked on Tatoeba with BLEU (60.4) and chr-F (0.745) scores \n\nComparison: \nNo explicit comparison with other"
+ }
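
As a quick illustration of running this MarianMT checkpoint, here is a standard transformers translation pipeline sketch (not code taken from the card).

```python
from transformers import pipeline

# opus-mt-en-da: English -> Danish; SentencePiece tokenization is handled by the pipeline.
translate = pipeline("translation", model="Helsinki-NLP/opus-mt-en-da")
print(translate("The weather in Copenhagen is lovely today.")[0]["translation_text"])
```
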
model_data_json/HuggingFaceTB_SmolLM2-1.7B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "model_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+ "downloads": 80704,
+ "tags": [
+ "transformers",
+ "tensorboard",
+ "onnx",
+ "safetensors",
+ "llama",
+ "text-generation",
+ "transformers.js",
+ "conversational",
+ "en",
+ "arxiv:2502.02737",
+ "base_model:HuggingFaceTB/SmolLM2-1.7B",
+ "base_model:quantized:HuggingFaceTB/SmolLM2-1.7B",
+ "license:apache-2.0",
+ "autotrain_compatible",
+ "text-generation-inference",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- library_name: transformers license: apache-2.0 language: - en pipeline_tag: text-generation tags: - safetensors - onnx - transformers.js base_model: - HuggingFaceTB/SmolLM2-1.7B --- # SmolLM2 !image/png ## Table of Contents 1. Model Summary 2. Evaluation 3. Examples 4. Limitations 5. Training 6. License 7. Citation ## Model Summary SmolLM2 is a family of compact language models available in three size: 135M, 360M, and 1.7B parameters. They are capable of solving a wide range of tasks while being lightweight enough to run on-device. More details in our paper: The 1.7B variant demonstrates significant advances over its predecessor SmolLM1-1.7B, particularly in instruction following, knowledge, reasoning, and mathematics. It was trained on 11 trillion tokens using a diverse dataset combination: FineWeb-Edu, DCLM, The Stack, along with new mathematics and coding datasets that we curated and will release soon. We developed the instruct version through supervised fine-tuning (SFT) using a combination of public datasets and our own curated datasets. We then applied Direct Preference Optimization (DPO) using UltraFeedback. The instruct model additionally supports tasks such as text rewriting, summarization and function calling thanks to datasets developed by Argilla such as Synth-APIGen-v0.1. You can find the SFT dataset here: For more details refer to: You will find pre-training, post-training, evaluation and local inference code. ### How to use #### Transformers #### Chat in TRL You can also use the TRL CLI to chat with the model from the terminal: #### Transformers.js ## Evaluation In this section, we report the evaluation results of SmolLM2. All evaluations are zero-shot unless stated otherwise, and we use lighteval to run them. ## Base Pre-Trained Model | Metric | SmolLM2-1.7B | Llama-1B | Qwen2.5-1.5B | SmolLM1-1.7B | |------------------|--------------|-------------|---------------|--------------| | HellaSwag | **68.7** | 61.2 | 66.4 | 62.9 | | ARC (Average) | **60.5** | 49.2 | 58.5 | 59.9 | | PIQA | **77.6** | 74.8 | 76.1 | 76.0 | | MMLU-Pro (MCF) | **19.4** | 11.7 | 13.7 | 10.8 | | CommonsenseQA | **43.6** | 41.2 | 34.1 | 38.0 | | TriviaQA | **36.7** | 28.1 | 20.9 | 22.5 | | Winogrande | **59.4** | 57.8 | 59.3 | 54.7 | | OpenBookQA | 42.2 | 38.4 | 40.0 | **42.4** | | GSM8K (5-shot) | 31.0 | 7.2 | **61.3** | 5.5 | ## Instruction Model | Metric | SmolLM2-1.7B-Instruct | Llama-1B-Instruct | Qwen2.5-1.5B-Instruct | SmolLM1-1.7B-Instruct | |:-----------------------------|:---------------------:|:-----------------:|:----------------------:|:----------------------:| | IFEval (Average prompt/inst) | **56.7** | 53.5 | 47.4 | 23.1 | | MT-Bench | 6.13 | 5.48 | **6.52** | 4.33 | | OpenRewrite-Eval (micro_avg RougeL) | 44.9 | 39.2 | **46.9** | NaN | | HellaSwag | **66.1** | 56.1 | 60.9 | 55.5 | | ARC (Average) | **51.7** | 41.6 | 46.2 | 43.7 | | PIQA | **74.4** | 72.3 | 73.2 | 71.6 | | MMLU-Pro (MCF) | 19.3 | 12.7 | **24.2** | 11.7 | | BBH (3-shot) | 32.2 | 27.6 | **35.3** | 25.7 | | GSM8K (5-shot) | **48.2** | 26.8 | 42.8 | 4.62 | ## Examples Below are some system and instruct prompts that work well for special tasks ### Text rewriting ### Summarization ### Function calling SmolLM2-1.7B-Instruct can handle function calling, it scores 27% on the BFCL Leaderboard. Here's how you can leverage it: More details such as parallel function calls and tools not available can be found here ## Limitations SmolLM2 models primarily understand and generate content in English. 
They can produce text on a variety of topics, but the generated content may not always be factually accurate, logically consistent, or free from biases present in the training data. These models should be used as assistive tools rather than definitive sources of information. Users should always verify important information and critically evaluate any generated content. ## Training ### Model - **Architecture:** Transformer decoder - **Pretraining tokens:** 11T - **Precision:** bfloat16 ### Hardware - **GPUs:** 256 H100 ### Software - **Training Framework:** nanotron - **Alignment Handbook** alignment-handbook ## License Apache 2.0 ## Citation",
+ "model_explanation_gemini": "A compact 1.7B-parameter instruction-tuned language model optimized for on-device use, excelling in tasks like text rewriting, summarization, function calling, and reasoning while outperforming similar-sized models like Llama-1B and SmolLM1-1.7B in instruction following and knowledge tasks. \n\n**Features:** \n- Lightweight design for on-device deployment \n- Instruction fine-tuning (SFT + DPO) for improved task performance \n- Supports text"
+ }
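
The "How to use / Transformers" snippet referenced in the description is missing. A minimal chat-template sketch for this instruct model is shown below; the generation settings are illustrative, not the card's exact values.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "What is the capital of France?"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=64, temperature=0.2, do_sample=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
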
model_data_json/HuggingFaceTB_SmolVLM-Instruct.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "model_id": "HuggingFaceTB/SmolVLM-Instruct",
+ "downloads": 72887,
+ "tags": [
+ "transformers",
+ "onnx",
+ "safetensors",
+ "idefics3",
+ "image-text-to-text",
+ "conversational",
+ "en",
+ "dataset:HuggingFaceM4/the_cauldron",
+ "dataset:HuggingFaceM4/Docmatix",
+ "arxiv:2504.05299",
+ "base_model:HuggingFaceTB/SmolLM2-1.7B-Instruct",
+ "base_model:quantized:HuggingFaceTB/SmolLM2-1.7B-Instruct",
+ "license:apache-2.0",
+ "endpoints_compatible",
+ "region:us"
+ ],
+ "description": "--- library_name: transformers license: apache-2.0 datasets: - HuggingFaceM4/the_cauldron - HuggingFaceM4/Docmatix pipeline_tag: image-text-to-text language: - en base_model: - HuggingFaceTB/SmolLM2-1.7B-Instruct - google/siglip-so400m-patch14-384 --- <img src=\" width=\"800\" height=\"auto\" alt=\"Image description\"> # SmolVLM SmolVLM is a compact open multimodal model that accepts arbitrary sequences of image and text inputs to produce text outputs. Designed for efficiency, SmolVLM can answer questions about images, describe visual content, create stories grounded on multiple images, or function as a pure language model without visual inputs. Its lightweight architecture makes it suitable for on-device applications while maintaining strong performance on multimodal tasks. ## Model Summary - **Developed by:** Hugging Face 🤗 - **Model type:** Multi-modal model (image+text) - **Language(s) (NLP):** English - **License:** Apache 2.0 - **Architecture:** Based on Idefics3 (see technical summary) ## Resources - **Demo:** SmolVLM Demo - **Blog:** Blog post ## Uses SmolVLM can be used for inference on multimodal (image + text) tasks where the input comprises text queries along with one or more images. Text and images can be interleaved arbitrarily, enabling tasks like image captioning, visual question answering, and storytelling based on visual content. The model does not support image generation. To fine-tune SmolVLM on a specific task, you can follow the fine-tuning tutorial. <!-- todo: add link to fine-tuning tutorial --> ### Technical Summary SmolVLM leverages the lightweight SmolLM2 language model to provide a compact yet powerful multimodal experience. It introduces several changes compared to previous Idefics models: - **Image compression:** We introduce a more radical image compression compared to Idefics3 to enable the model to infer faster and use less RAM. - **Visual Token Encoding:** SmolVLM uses 81 visual tokens to encode image patches of size 384×384. Larger images are divided into patches, each encoded separately, enhancing efficiency without compromising performance. More details about the training and architecture are available in our technical report. ### How to get started You can use transformers to load, infer and fine-tune SmolVLM. ### Model optimizations **Precision**: For better performance, load and run the model in half-precision ( or ) if your hardware supports it. You can also load SmolVLM with 4/8-bit quantization using bitsandbytes, torchao or Quanto. Refer to this page for other options. **Vision Encoder Efficiency**: Adjust the image resolution by setting when initializing the processor, where N is your desired value. The default works well, which results in input images of size 1536×1536. For documents, might be beneficial. Decreasing N can save GPU memory and is appropriate for lower-resolution images. This is also useful if you want to fine-tune on videos. ## Misuse and Out-of-scope Use SmolVLM is not intended for high-stakes scenarios or critical decision-making processes that affect an individual's well-being or livelihood. The model may produce content that appears factual but may not be accurate. 
Misuse includes, but is not limited to: - Prohibited Uses: - Evaluating or scoring individuals (e.g., in employment, education, credit) - Critical automated decision-making - Generating unreliable factual content - Malicious Activities: - Spam generation - Disinformation campaigns - Harassment or abuse - Unauthorized surveillance ### License SmolVLM is built upon the shape-optimized SigLIP as image encoder and SmolLM2 for text decoder part. We release the SmolVLM checkpoints under the Apache 2.0 license. ## Training Details ### Training Data The training data comes from The Cauldron and Docmatix datasets, with emphasis on document understanding (25%) and image captioning (18%), while maintaining balanced coverage across other crucial capabilities like visual reasoning, chart comprehension, and general instruction following. <img src=\" alt=\"Example Image\" style=\"width:90%;\" /> ## Evaluation | Model | MMMU (val) | MathVista (testmini) | MMStar (val) | DocVQA (test) | TextVQA (val) | Min GPU RAM required (GB) | |-------------------|------------|----------------------|--------------|---------------|---------------|---------------------------| | SmolVLM | 38.8 | 44.6 | 42.1 | 81.6 | 72.7 | 5.02 | | Qwen-VL 2B | 41.1 | 47.8 | 47.5 | 90.1 | 79.7 | 13.70 | | InternVL2 2B | 34.3 | 46.3 | 49.8 | 86.9 | 73.4 | 10.52 | | PaliGemma 3B 448px| 34.9 | 28.7 | 48.3 | 32.2 | 56.0 | 6.72 | | moondream2 | 32.4 | 24.3 | 40.3 | 70.5 | 65.2 | 3.87 | | MiniCPM-V-2 | 38.2 | 39.8 | 39.1 | 71.9 | 74.1 | 7.88 | | MM1.5 1B | 35.8 | 37.2 | 0.0 | 81.0 | 72.5 | NaN | # Citation information You can cite us in the following way:",
22
+ "model_explanation_gemini": "\"SmolVLM is a compact multimodal model that processes interleaved image and text inputs to generate text outputs, excelling in tasks like visual question answering, image captioning, and storytelling while optimized for efficiency and on-device use.\"\n\n**Model Features:** \n1. Multimodal (image + text) input with arbitrary interleaving \n2. Text-only capability (functions as language model) \n3. Lightweight architecture for on-device applications \n4. 81 visual tokens"
23
+ }
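The SmolVLM card above says the model is loaded and run with transformers, ideally in half precision. A minimal sketch of that workflow (not the card's own snippet): it assumes a CUDA device and a hypothetical local image `photo.jpg`.

```python
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

model_id = "HuggingFaceTB/SmolVLM-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # half precision, as the card recommends when supported
).to("cuda")

image = Image.open("photo.jpg")  # hypothetical local image path
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```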
model_data_json/HuggingFaceTB_SmolVLM2-2.2B-Instruct.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "model_id": "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
3
+ "downloads": 74571,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "smolvlm",
8
+ "image-text-to-text",
9
+ "video-text-to-text",
10
+ "conversational",
11
+ "en",
12
+ "dataset:HuggingFaceM4/the_cauldron",
13
+ "dataset:HuggingFaceM4/Docmatix",
14
+ "dataset:lmms-lab/LLaVA-OneVision-Data",
15
+ "dataset:lmms-lab/M4-Instruct-Data",
16
+ "dataset:HuggingFaceFV/finevideo",
17
+ "dataset:MAmmoTH-VL/MAmmoTH-VL-Instruct-12M",
18
+ "dataset:lmms-lab/LLaVA-Video-178K",
19
+ "dataset:orrzohar/Video-STaR",
20
+ "dataset:Mutonix/Vript",
21
+ "dataset:TIGER-Lab/VISTA-400K",
22
+ "dataset:Enxin/MovieChat-1K_train",
23
+ "dataset:ShareGPT4Video/ShareGPT4Video",
24
+ "arxiv:2504.05299",
25
+ "base_model:HuggingFaceTB/SmolVLM-Instruct",
26
+ "base_model:finetune:HuggingFaceTB/SmolVLM-Instruct",
27
+ "license:apache-2.0",
28
+ "endpoints_compatible",
29
+ "region:us"
30
+ ],
31
+ "description": "--- library_name: transformers license: apache-2.0 datasets: - HuggingFaceM4/the_cauldron - HuggingFaceM4/Docmatix - lmms-lab/LLaVA-OneVision-Data - lmms-lab/M4-Instruct-Data - HuggingFaceFV/finevideo - MAmmoTH-VL/MAmmoTH-VL-Instruct-12M - lmms-lab/LLaVA-Video-178K - orrzohar/Video-STaR - Mutonix/Vript - TIGER-Lab/VISTA-400K - Enxin/MovieChat-1K_train - ShareGPT4Video/ShareGPT4Video pipeline_tag: image-text-to-text tags: - video-text-to-text language: - en base_model: - HuggingFaceTB/SmolVLM-Instruct --- <img src=\" width=\"800\" height=\"auto\" alt=\"Image description\"> # SmolVLM2 2.2B SmolVLM2-2.2B is a lightweight multimodal model designed to analyze video content. The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 5.2GB of GPU RAM for video inference, it delivers robust performance on complex multimodal tasks. This efficiency makes it particularly well-suited for on-device applications where computational resources may be limited. ## Model Summary - **Developed by:** Hugging Face 🤗 - **Model type:** Multi-modal model (image/multi-image/video/text) - **Language(s) (NLP):** English - **License:** Apache 2.0 - **Architecture:** Based on Idefics3 (see technical summary) ## Resources - **Demo:** Video Highlight Generator - **Blog:** Blog post ## Uses SmolVLM2 can be used for inference on multimodal (video / image / text) tasks where the input consists of text queries along with video or one or more images. Text and media files can be interleaved arbitrarily, enabling tasks like captioning, visual question answering, and storytelling based on visual content. The model does not support image or video generation. To fine-tune SmolVLM2 on a specific task, you can follow the fine-tuning tutorial. ## Evaluation ### Vision Evaluation | Model | Mathvista | MMMU | OCRBench | MMStar | AI2D | ChartQA_Test | Science_QA | TextVQA Val | DocVQA Val | |-------------------|-----------|-------|----------|--------|------|--------------|------------|-------------|------------| | **SmolVLM2 2.2B** | 51.5 | 42 | 72.9 | 46 | 70 | 68.84 | 90 | 73.21 | 79.98 | | SmolVLM 2.2B | 43.9 | 38.3 | 65.5 | 41.8 | 84.5 | 71.6 | 84.5 | 72.1 | 79.7 | ### Video Evaluation We evaluated the performance of the SmolVLM2 family on the following scientific benchmarks: | Size | Video-MME | MLVU | MVBench | |----------|-----------------|----------|---------------| | 2.2B | 52.1 | 55.2 | 46.27 | | 500M | 42.2 | 47.3 | 39.73 | | 256M | 33.7 | 40.6 | 32.7 | ### How to get started You can use transformers to load, infer and fine-tune SmolVLM. Make sure you have num2words, flash-attn and latest transformers installed. You can load the model as follows. #### Simple Inference You preprocess your inputs directly using chat templates and directly passing them #### Video Inference To use SmolVLM2 for video inference, make sure you have decord installed. #### Multi-image Interleaved Inference You can interleave multiple media with text using chat templates. ### Model optimizations ## Misuse and Out-of-scope Use SmolVLM is not intended for high-stakes scenarios or critical decision-making processes that affect an individual's well-being or livelihood. The model may produce content that appears factual but may not be accurate. 
Misuse includes, but is not limited to: - Prohibited Uses: - Evaluating or scoring individuals (e.g., in employment, education, credit) - Critical automated decision-making - Generating unreliable factual content - Malicious Activities: - Spam generation - Disinformation campaigns - Harassment or abuse - Unauthorized surveillance ### License SmolVLM2 is built upon the shape-optimized SigLIP as image encoder and SmolLM2 for text decoder part. We release the SmolVLM2 checkpoints under the Apache 2.0 license. ## Citation information You can cite us in the following way: ## Training Data SmolVLM2 used 3.3M samples for training originally from ten different datasets: LlaVa Onevision, M4-Instruct, Mammoth, LlaVa Video 178K, FineVideo, VideoStar, VRipt, Vista-400K, MovieChat and ShareGPT4Video. In the following plots we give a general overview of the samples across modalities and the source of those samples. <!-- <center><img src=\" width=\"auto\" height=\"auto\" alt=\"Image description\"> </center> ### Details <img src=\" width=\"auto\" height=\"auto\" alt=\"Image description\"> --> ## Data Split per modality | Data Type | Percentage | |--------------|------------| | Image | 34.4% | | Text | 20.2% | | Video | 33.0% | | Multi-image | 12.3% | ## Granular dataset slices per modality ### Text Datasets | Dataset | Percentage | |--------------------------------------------|------------| | llava-onevision/magpie_pro_ft3_80b_mt | 6.8% | | llava-onevision/magpie_pro_ft3_80b_tt | 6.8% | | llava-onevision/magpie_pro_qwen2_72b_tt | 5.8% | | llava-onevision/mathqa | 0.9% | ### Multi-image Datasets | Dataset | Percentage | |--------------------------------------------|------------| | m4-instruct-data/m4_instruct_multiimage | 10.4% | | mammoth/multiimage-cap6 | 1.9% | ### Image Datasets | Dataset | Percentage | |--------------------------------------------|------------| | llava-onevision/other | 17.4% | | llava-onevision/vision_flan | 3.9% | | llava-onevision/mavis_math_metagen | 2.6% | | llava-onevision/mavis_math_rule_geo | 2.5% | | llava-onevision/sharegpt4o | 1.7% | | llava-onevision/sharegpt4v_coco | 1.5% | | llava-onevision/image_textualization | 1.3% | | llava-onevision/sharegpt4v_llava | 0.9% | | llava-onevision/mapqa | 0.9% | | llava-onevision/qa | 0.8% | | llava-onevision/textocr | 0.8% | ### Video Datasets | Dataset | Percentage | |--------------------------------------------|------------| | llava-video-178k/1-2m | 7.3% | | llava-video-178k/2-3m | 7.0% | | other-video/combined | 5.7% | | llava-video-178k/hound | 4.4% | | llava-video-178k/0-30s | 2.4% | | video-star/starb | 2.2% | | vista-400k/combined | 2.2% | | vript/long | 1.0% | | ShareGPT4Video/all | 0.8% |",
32
+ "model_explanation_gemini": "A lightweight multimodal model analyzing video, image, and text inputs to generate text outputs for tasks like visual question answering, captioning, and transcription, optimized for efficiency in resource-limited environments. \n\n**Features:** \n- Multimodal (video/image/text input, text output) \n- Tasks: captioning, visual QA, storytelling, text transcription \n- Compact size (5.2GB GPU RAM for video inference) \n- No image/video generation capability \n- Apache 2.0 license"
33
+ }
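The SmolVLM2 card above points to video inference with transformers (plus the num2words and decord dependencies it mentions). A hedged sketch of that flow, assuming a recent transformers release with video chat-template support; `clip.mp4` is a placeholder path and the exact processor arguments may differ from the official snippet.

```python
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
).to("cuda")

messages = [
    {"role": "user", "content": [
        {"type": "video", "path": "clip.mp4"},  # placeholder video file
        {"type": "text", "text": "Describe this video in detail."},
    ]},
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=128)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```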
model_data_json/KBLab_kb-whisper-small.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "KBLab/kb-whisper-small",
3
+ "downloads": 28474,
4
+ "tags": [
5
+ "transformers",
6
+ "onnx",
7
+ "safetensors",
8
+ "whisper",
9
+ "automatic-speech-recognition",
10
+ "ctranslate2",
11
+ "sv",
12
+ "dataset:KBLab/rixvox-v2",
13
+ "base_model:openai/whisper-small",
14
+ "base_model:quantized:openai/whisper-small",
15
+ "license:apache-2.0",
16
+ "endpoints_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- library_name: transformers base_model: openai/whisper-small language: - sv pipeline_tag: automatic-speech-recognition license: apache-2.0 datasets: - KBLab/rixvox-v2 tags: - ctranslate2 --- ## KB-Whisper Small The National Library of Sweden releases a new suite of Whisper models trained on over 50,000 hours of Swedish speech. In evaluations across FLEURS, CommonVoice and NST, our best performing model reduces the Word Error Rate (WER) by an average of 47% compared to OpenAI's . The performance of smaller Whisper model sizes on Swedish speech has also substantially improved, with outperforming (a model six times its size). | Model size | | FLEURS | CommonVoice | NST | |------------|---------|--------|-------------|------| | tiny | **KBLab** | **13.2** | **12.9** | **11.2** | | | OpenAI | 59.2 | 67.8 | 85.2 | | base | **KBLab** | **9.1** | **8.7** | **7.8** | | | OpenAI | 39.6 | 52.1 | 53.4 | | small | **KBLab** | **7.3** | **6.4** | **6.6** | | | OpenAI | 20.6 | 26.4 | 26.4 | | medium | **KBLab** | **6.6** | **5.4** | **5.8** | | | OpenAI | 12.1 | 15.8 | 17.1 | | large-v3 | **KBLab** | **5.4** | **4.1** | **5.2** | | | OpenAI | 7.8 | 9.5 | 11.3 | Table: **Word Error Rate (WER)** comparison between KBLab's Whisper models and the corresponding OpenAI versions. ### Usage We provide checkpoints in different formats: , (GGML), , and (used in and ). #### Hugging Face Inference example for using with Hugging Face: #### Faster-whisper Faster-whisper provides fast and efficient inference via a reimplementation of Whisper using . #### WhisperX WhisperX provides a convenient method of getting accurate word level timestamps. The library combines (force aligns) the text output of Whisper with the accurate timestamps of Wav2vec2. We provide an example below of how to use together with KBLab/wav2vec2-large-voxrex-swedish. #### Whisper.cpp / GGML We provide GGML checkpoints used in the apps and . To use our model with first clone the repository and build the library: To use the model you need to download one of the GGML checkpoints we have uploaded. You can either press the download buttons here, or download using : Run inference by specifying the model path after the argument , along with the path to the audio file as the last positional argument. #### onnx (optimum) and transformers.js usage You can use the checkpoints via Hugging Face's library in the following manner: An example of an app that runs inference locally in the browser with and can be found at (created by Pierre Mesure). A template for setting up such an app with javascript can be found at ### Training data Our models have been trained on over 50,000 hours of Swedish audio with text transcriptions. The models were trained in 2 stages, each characterized by the application of different quality filters and thresholds for said filters. Stage 1 employed low threshold values (0 to 0.30 BLEU depending on dataset), whereas Stage 2 used stricter thresholds (, weighted ROUGE-N , CER of first and last 10 characters ). | Dataset | Continued pretraining (h) -- Stage 1 | Finetuning (h) -- Stage 2 | |-------------|--------------------------|--------------| | Subtitles | 34,261 | 3,110 | | Riksdag | 21,949 | 5,119 | | ISOF | 54 | 54 | | NST | 250 | 250 | | **Total** | **56,514** | **8,533** | The default when loading our models through Hugging Face is **Stage 2**. We have however also uploaded continued pretraining checkpoints and tagged them. You can load these other checkpoints by specifying the in . 
The pretrained checkpoints tag can for example be found here: []( The Stage 2 default model tag is named . We supply a different stage 2 checkpoint -- with a more condensed style of transcribing -- under the name . ### Evaluation #### WER | Model size | | FLEURS | CommonVoice | NST | |------------|---------|--------|-------------|------| | tiny | **KBLab** | **13.2** | **12.9** | **11.2** | | | OpenAI | 59.2 | 67.8 | 85.2 | | base | **KBLab** | **9.1** | **8.7** | **7.8** | | | OpenAI | 39.6 | 52.1 | 53.4 | | small | **KBLab** | **7.3** | **6.4** | **6.6** | | | OpenAI | 20.6 | 26.4 | 26.4 | | medium | **KBLab** | **6.6** | **5.4** | **5.8** | | | OpenAI | 12.1 | 15.8 | 17.1 | | large-v3 | **KBLab** | **5.4** | **4.1** | **5.2** | | | OpenAI | 7.8 | 9.5 | 11.3 | #### BLEU Score | Model size | | FLEURS | CommonVoice | NST | |------------|---------|--------|-------------|------| | tiny | KBLab | **76.6** | **73.7** | **74.3** | | | OpenAI | 26.9 | 21.1 | 24.0 | | base | KBLab | **83.2** | **79.9** | **78.3** | | | OpenAI | 41.1 | 32.5 | 36.9 | | small | KBLab | **86.6** | **83.5** | **79.6** | | | OpenAI | 64.0 | 56.5 | 58.2 | | medium | KBLab | **87.6** | **85.0** | **80.2** | | | OpenAI | 77.1 | 70.1 | 68.9 | | large-v3 | KBLab | **89.8** | **87.2** | **81.1** | | | OpenAI | 84.9 | 79.1 | 75.1 | ### Acknowledgements We acknowledge the EuroHPC Joint Undertaking for awarding this project access to the EuroHPC supercomputer LEONARDO, hosted by CINECA (Italy) and the LEONARDO consortium through an EuroHPC AI and Data-Intensive Applications Access call. ### Citation Paper reference coming soon.",
20
+ "model_explanation_gemini": "A Swedish-optimized automatic speech recognition model trained on 50,000+ hours of Swedish speech, significantly reducing word error rates compared to OpenAI's Whisper models. \n\n**Features**: \n- Optimized for Swedish speech recognition \n- Trained on 50,000+ hours of Swedish audio \n- Lower Word Error Rate (WER) than OpenAI's Whisper models (47% average improvement) \n- Available in multiple formats (CT2, GGML, ONNX, Transformers"
21
+ }
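The KB-Whisper card above lists several runtimes (transformers, faster-whisper, whisper.cpp, ONNX). A minimal transformers sketch for Swedish transcription, assuming a placeholder audio file `speech.mp3`:

```python
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
asr = pipeline(
    "automatic-speech-recognition",
    model="KBLab/kb-whisper-small",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device=device,
)

# Long recordings are split into 30-second chunks before decoding.
result = asr(
    "speech.mp3",  # placeholder path to a Swedish recording
    chunk_length_s=30,
    generate_kwargs={"task": "transcribe", "language": "sv"},
)
print(result["text"])
```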
model_data_json/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
3
+ "downloads": 69297,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "exaone",
8
+ "text-generation",
9
+ "lg-ai",
10
+ "conversational",
11
+ "custom_code",
12
+ "en",
13
+ "ko",
14
+ "arxiv:2408.03541",
15
+ "license:other",
16
+ "autotrain_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: other license_name: exaone license_link: LICENSE language: - en - ko tags: - lg-ai - exaone --- <p align=\"center\"> <img src=\"assets/EXAONE_Symbol+BI_3d.png\", width=\"300\", style=\"margin: 40 auto;\"> <br> # EXAONE-3.0-7.8B-Instruct **👋👋 We have revised our license for revitalizing the research ecosystem.👋👋** ## Introduction We introduce EXAONE-3.0-7.8B-Instruct, a pre-trained and instruction-tuned bilingual (English and Korean) generative model with 7.8 billion parameters. The model was pre-trained with 8T curated tokens and post-trained with supervised fine-tuning and direct preference optimization. It demonstrates highly competitive benchmark performance against other state-of-the-art open models of similar size. For more details, please refer to our technical report, blog and GitHub. ## Quickstart We recommend to use transformers v4.41 or later. > ### Note > The EXAONE 3.0 instruction-tuned language model was trained to utilize the system prompt, > so we highly recommend using the system prompts provided in the code snippet above. ## Evaluation We compared EXAONE-3.0-7.8B-Instruct with similar-sized instruction-tuned LLMs. To verify the performance of real-world use cases, we measured benchmarks that have a high correlation with LMSYS Chatbot Arena. Some experimental results are shown below. The full evaluation results can be found in the technical report. | Language | Benchmark | EXAONE 3.0 <br>7.8B Inst. | Llama 3.1 <br>8B Inst. | Gemma 2 <br>9B Inst. | QWEN 2 <br>7B Inst. | Phi 3 <br>7B Inst. | Mistral 7B <br>Inst. | | :-----: | :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | | English | MT-Bench | **9.01** | 7.95 | 8.52 | 8.41 | 8.52 | 7.72 | | | Arena-Hard-v0.1 | **46.8** | 28.0 | 42.1 | 21.7 | 29.1 | 16.2 | | | WildBench | **48.2** | 34.5 | 41.5 | 34.9 | 32.8 | 29.0 | | | AlpacaEval 2.0 LC | 45.0 | 31.5 | **47.5** | 24.5 | 37.1 | 31.0 | | Korean | KoMT-Bench<sup>[1] | **8.92** | 6.06 | 7.92 | 7.69 | 4.87 | 5.20 | | | LogicKor | **8.62** | 5.40 | 8.07 | 6.12 | 3.76 | 3.42 | - [1] KoMT-Bench is a dataset created by translating MT-Bench into Korean; see README for more details. ## Limitation The EXAONE language model has certain limitations and may occasionally generate inappropriate responses. The language model generates responses based on the output probability of tokens, and it is determined during learning from training data. While we have made every effort to exclude personal, harmful, and biased information from the training data, some problematic content may still be included, potentially leading to undesirable responses. Please note that the text generated by EXAONE language model does not reflects the views of LG AI Research. - Inappropriate answers may be generated, which contain personal, harmful or other inappropriate information. - Biased responses may be generated, which are associated with age, gender, race, and so on. - The generated responses rely heavily on statistics from the training data, which can result in the generation of semantically or syntactically incorrect sentences. - Since the model does not reflect the latest information, the responses may be false or contradictory. LG AI Research strives to reduce potential risks that may arise from EXAONE language model. Users are not allowed to engage in any malicious activities (e.g., keying in illegal information) that may induce the creation of inappropriate outputs violating LG AI’s ethical principles when using EXAONE language model. 
## License The model is licensed under EXAONE AI Model License Agreement 1.1 - NC ## Citation ## Contact LG AI Research Technical Support: [email protected]",
20
+ "model_explanation_gemini": "A 7.8B-parameter bilingual (English and Korean) instruction-tuned generative model optimized for competitive performance in benchmarks, trained with supervised fine-tuning and direct preference optimization. \n\n**Features:** \n- Bilingual (English and Korean) \n- 7.8 billion parameters \n- Pre-trained on 8T tokens \n- Instruction-tuned with supervised fine-tuning and DPO \n- Competitive benchmark performance against similar-sized models \n- Requires system prompts for optimal use \n\n**Comparison:**"
21
+ }
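The EXAONE card above recommends transformers v4.41 or later and the use of a system prompt. A sketch of instruction-following generation under those assumptions; the system prompt below is illustrative, not the card's mandated wording.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # the repo ships custom EXAONE modeling code
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are EXAONE model from LG AI Research, a helpful assistant."},
    {"role": "user", "content": "Explain, in Korean, what a language model is."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```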
model_data_json/LorenzoDeMattei_GePpeTto.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model_id": "LorenzoDeMattei/GePpeTto",
3
+ "downloads": 78170,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "jax",
8
+ "safetensors",
9
+ "gpt2",
10
+ "text-generation",
11
+ "it",
12
+ "arxiv:2004.14253",
13
+ "autotrain_compatible",
14
+ "text-generation-inference",
15
+ "endpoints_compatible",
16
+ "region:us"
17
+ ],
18
+ "description": "--- language: it --- # GePpeTto GPT2 Model 🇮🇹 Pretrained GPT2 117M model for Italian. You can find further details in the paper: Lorenzo De Mattei, Michele Cafagna, Felice Dell’Orletta, Malvina Nissim, Marco Guerini \"GePpeTto Carves Italian into a Language Model\", arXiv preprint. Pdf available at: ## Pretraining Corpus The pretraining set comprises two main sources. The first one is a dump of Italian Wikipedia (November 2019), consisting of 2.8GB of text. The second one is the ItWac corpus (Baroni et al., 2009), which amounts to 11GB of web texts. This collection provides a mix of standard and less standard Italian, on a rather wide chronological span, with older texts than the Wikipedia dump (the latter stretches only to the late 2000s). ## Pretraining details This model was trained using GPT2's Hugging Face implemenation on 4 NVIDIA Tesla T4 GPU for 620k steps. Training parameters: - GPT-2 small configuration - vocabulary size: 30k - Batch size: 32 - Block size: 100 - Adam Optimizer - Initial learning rate: 5e-5 - Warm up steps: 10k ## Perplexity scores | Domain | Perplexity | |---|---| | Wikipedia | 26.1052 | | ItWac | 30.3965 | | Legal | 37.2197 | | News | 45.3859 | | Social Media | 84.6408 | For further details, qualitative analysis and human evaluation check out: ## Load Pretrained Model You can use this model by installing Huggingface library . And you can use it directly by initializing it like this: ## Example using GPT2LMHeadModel Output is, ## Citation Please use the following bibtex entry: ## References Marco Baroni, Silvia Bernardini, Adriano Ferraresi, and Eros Zanchetta. 2009. The WaCky wide web: a collection of very large linguistically processed webcrawled corpora. Language resources and evaluation, 43(3):209–226.",
19
+ "model_explanation_gemini": "\"GePpeTto is a 117M-parameter GPT-2 model pretrained on Italian text from Wikipedia and ItWac for generating and processing Italian language.\"\n\n**Model Features:** \n- Language: Italian (🇮🇹) \n- Architecture: GPT-2 small (117M parameters) \n- Pretraining Corpus: Italian Wikipedia (2.8GB) + ItWac (11GB) \n- Training: 620k steps on 4× Tesla T4"
20
+ }
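The GePpeTto card above says the pretrained model can be used directly through the Hugging Face library. A minimal text-generation sketch with an illustrative Italian prompt:

```python
from transformers import pipeline

# Italian GPT-2 (117M); the pipeline handles tokenization and sampling.
generator = pipeline("text-generation", model="LorenzoDeMattei/GePpeTto")

outputs = generator(
    "La Divina Commedia è",  # illustrative prompt
    max_length=60,
    do_sample=True,
    top_k=50,
    num_return_sequences=2,
)
for out in outputs:
    print(out["generated_text"])
```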
model_data_json/M-CLIP_XLM-Roberta-Large-Vit-B-16Plus.json ADDED
1
+ {
2
+ "model_id": "M-CLIP/XLM-Roberta-Large-Vit-B-16Plus",
3
+ "downloads": 83225,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "tf",
8
+ "multilingual",
9
+ "af",
10
+ "sq",
11
+ "am",
12
+ "ar",
13
+ "az",
14
+ "bn",
15
+ "bs",
16
+ "bg",
17
+ "ca",
18
+ "zh",
19
+ "hr",
20
+ "cs",
21
+ "da",
22
+ "nl",
23
+ "en",
24
+ "et",
25
+ "fr",
26
+ "de",
27
+ "el",
28
+ "hi",
29
+ "hu",
30
+ "is",
31
+ "id",
32
+ "it",
33
+ "ja",
34
+ "mk",
35
+ "ml",
36
+ "mr",
37
+ "pl",
38
+ "pt",
39
+ "ro",
40
+ "ru",
41
+ "sr",
42
+ "sl",
43
+ "es",
44
+ "sw",
45
+ "sv",
46
+ "tl",
47
+ "te",
48
+ "tr",
49
+ "tk",
50
+ "uk",
51
+ "ur",
52
+ "ug",
53
+ "uz",
54
+ "vi",
55
+ "xh",
56
+ "endpoints_compatible",
57
+ "region:us"
58
+ ],
59
+ "description": "--- language: - multilingual - af - sq - am - ar - az - bn - bs - bg - ca - zh - hr - cs - da - nl - en - et - fr - de - el - hi - hu - is - id - it - ja - mk - ml - mr - pl - pt - ro - ru - sr - sl - es - sw - sv - tl - te - tr - tk - uk - ur - ug - uz - vi - xh --- ## Multilingual-clip: XLM-Roberta-Large-Vit-B-16Plus Multilingual-CLIP extends OpenAI's English text encoders to multiple other languages. This model *only* contains the multilingual text encoder. The corresponding image model can be retrieved via instructions found on open_clip repository on Github. We provide a usage example below. ## Requirements To use both the multilingual text encoder and corresponding image encoder, we need to install the packages []( and []( ## Usage Extracting embeddings from the text encoder can be done in the following way: Extracting embeddings from the corresponding image encoder: ## Evaluation results None of the M-CLIP models have been extensivly evaluated, but testing them on Txt2Img retrieval on the humanly translated MS-COCO dataset, we see the following **R@10** results: | Name | En | De | Es | Fr | Zh | It | Pl | Ko | Ru | Tr | Jp | | ----------------------------------|:-----: |:-----: |:-----: |:-----: | :-----: |:-----: |:-----: |:-----: |:-----: |:-----: |:-----: | | OpenAI CLIP Vit-B/32| 90.3 | - | - | - | - | - | - | - | - | - | - | | OpenAI CLIP Vit-L/14| 91.8 | - | - | - | - | - | - | - | - | - | - | | OpenCLIP ViT-B-16+-| 94.3 | - | - | - | - | - | - | - | - | - | - | | LABSE Vit-L/14| 91.6 | 89.6 | 89.5 | 89.9 | 88.9 | 90.1 | 89.8 | 80.8 | 85.5 | 89.8 | 73.9 | | XLM-R Large Vit-B/32| 91.8 | 88.7 | 89.1 | 89.4 | 89.3 | 89.8| 91.4 | 82.1 | 86.1 | 88.8 | 81.0 | | XLM-R Vit-L/14| 92.4 | 90.6 | 91.0 | 90.0 | 89.7 | 91.1 | 91.3 | 85.2 | 85.8 | 90.3 | 81.9 | | XLM-R Large Vit-B/16+| **95.0** | **93.0** | **93.6** | **93.1** | **94.0** | **93.1** | **94.4** | **89.0** | **90.0** | **93.0** | **84.2** | ## Training/Model details Further details about the model training and data can be found in the model card.",
60
+ "model_explanation_gemini": "Extends OpenAI's CLIP text encoder to multiple languages for generating multilingual text embeddings that pair with an image encoder for cross-modal tasks like text-to-image retrieval.\n\nModel Features: \n- Multilingual text encoder (supports 50+ languages) \n- Based on XLM-Roberta-Large architecture \n- Paired with ViT-B-16Plus image encoder (separately available) \n- Designed for cross-modal retrieval tasks \n\nComparison: \nThe XLM-R Large Vit-B/16"
61
+ }
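The M-CLIP card above describes extracting multilingual text embeddings, with the paired image encoder coming from open_clip. A sketch under the assumption that the `multilingual-clip` pip package exposes the `pt_multilingual_clip.MultilingualCLIP` loader used in the project's README:

```python
# pip install multilingual-clip torch transformers
import transformers
from multilingual_clip import pt_multilingual_clip  # assumed package interface

model_name = "M-CLIP/XLM-Roberta-Large-Vit-B-16Plus"
model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

texts = [
    "Three blind horses listening to Mozart.",
    "Älgen är skogens konung!",
]
# One embedding per input text, aligned with the ViT-B-16Plus image space.
embeddings = model.forward(texts, tokenizer)
print(embeddings.shape)
```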
model_data_json/MahmoodLab_TITAN.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "MahmoodLab/TITAN",
3
+ "downloads": 239360,
4
+ "tags": [
5
+ "safetensors",
6
+ "titan",
7
+ "histology",
8
+ "pathology",
9
+ "vision",
10
+ "pytorch",
11
+ "self-supervised",
12
+ "vit",
13
+ "image-feature-extraction",
14
+ "custom_code",
15
+ "en",
16
+ "arxiv:2411.19666",
17
+ "license:cc-by-nc-nd-4.0",
18
+ "region:us"
19
+ ],
20
+ "description": "--- license: cc-by-nc-nd-4.0 language: - en tags: - histology - pathology - vision - pytorch - self-supervised - vit extra_gated_prompt: >- This model and associated code are released under the CC-BY-NC-ND 4.0 license and may only be used for non-commercial, academic research purposes with proper attribution. Any commercial use, sale, or other monetization of the TITAN model and its derivatives, which include models trained on outputs from the TITAN model or datasets created from the TITAN model, is prohibited and requires prior approval. Please note that the primary email used to sign up for your Hugging Face account must match your institutional email to receive approval. By downloading the model, you attest that all information (affiliation, research use) is correct and up-to-date. Downloading the model requires prior registration on Hugging Face and agreeing to the terms of use. By downloading this model, you agree not to distribute, publish or reproduce a copy of the model. If another user within your organization wishes to use the TITAN model, they must register as an individual user and agree to comply with the terms of use. Users may not attempt to re-identify the deidentified data used to develop the underlying model. If you are a commercial entity, please contact the corresponding author. extra_gated_fields: Full name (first and last): text Current affiliation (no abbreviations): text Type of Affiliation: type: select options: - Academia - Industry - label: Other value: other Current and official institutional email (**this must match your primary email in your Hugging Face account, @gmail/@hotmail/@qq email domains will be denied**): text Please explain your intended research use: text I agree to all terms outlined above: checkbox I agree to use this model for non-commercial, academic purposes only: checkbox I agree not to distribute the model, if another user within your organization wishes to use the TITAN model, they must register as an individual user: checkbox metrics: - accuracy pipeline_tag: image-feature-extraction --- # Model Card for TITAN-preview \\[Preprint\\] | \\[Github Repo\\] | \\[Cite\\] ## What is TITAN? **TITAN** (**T**ransformer-based pathology **I**mage and **T**ext **A**lignment **N**etwork) is a multimodal whole-slide foundation model pre-trained using visual self-supervised learning and vision-language alignment. It leverages 335,645 whole-slide images (WSIs) from a diverse set of internally collected neoplastic, infectious, and inflammatory cases at Mass General Brigham. Additionally, TITAN utilizes over 182,000 pathology reports and more than 423,000 synthetic captions generated by PathChat, our pathology co-pilot. TITAN's slide embeddings achieve state-of-the-art performance on diverse downstream tasks, including linear probing, few-shot and zero-shot classification, rare cancer retrieval, cross-modal retrieval, and pathology report generation. This is a preview and we will bring you further updates and improvements. **your request will be denied**. To fix this, you can: (1) add your official institutional email to your HF account, and confirm your email address to verify, and (2) set your institutional email as your primary email in your HF account. Other reasons for your request access being denied include other mistakes in the form submitted, for example: full name includes abbreviations, affiliation is not spelled out, the described research use is not sufficient, or email domain address not recognized. 
## Model Description - **Developed by:** Mahmood Lab AI for Pathology @ Harvard/BWH - **Model type:** Pretrained vision-language encoders - **Pretraining dataset:** Mass-340K, sourced from private histology collections (BWH / MGH), in addition to slides from the public GTEx consortium. - **Repository:** - **Preprint:** - **License:** CC-BY-NC-ND-4.0 ### Requirements ### Model Usage TITAN-preview is a vision-lanuage model trained on CONCH v1.5 patch features with patch size of 512x512 pixels at 20x magnification. Following authentication (using ), both TITAN-preview (slide and language encoders) and CONCH v1.5 (patch encoder) can be loaded using the commands below: You can directly use TITAN-preview for slide-level feature extaction. TITAN builds a feature grids from CONCH v1.5 patch features using the coordinates and the distance between the patches. As patch coordinates are always saved at the slides' level 0 magnification, TITAN takes patch_size_lv0 which represents the distance between two adjacent patches at level 0 magnification. It is 1024 if slide is 40x, or 512 if slide is 20x. We have this info saved in our demo TCGA features. Slide-level feature extraction can be done in the following way: These pre-extracted features can then be used for slide-level classification (via linear probing), retrieval (via l2 distance), and other machine learning settings, without task-specific finetuning. We also released all TCGA TITAN-preview features in . We demonstrated more detailed linear probe and zero-shot evaluation in our github. ## License and Terms of Use This model and associated code are released under the CC-BY-NC-ND 4.0 license and may only be used for non-commercial, academic research purposes with proper attribution. Any commercial use, sale, or other monetization of the TITAN model and its derivatives, which include models trained on outputs from the TITAN model or datasets created from the TITAN model, is prohibited and requires prior approval. Downloading the model requires prior registration on Hugging Face and agreeing to the terms of use. By downloading this model, you agree not to distribute, publish or reproduce a copy of the model. If another user within your organization wishes to use the TITAN model, they must register as an individual user and agree to comply with the terms of use. Users may not attempt to re-identify the deidentified data used to develop the underlying model. If you are a commercial entity, please contact the corresponding author. ## Contact For any additional questions or comments, contact Faisal Mahmood (), \\ Tong Ding (), \\ Sophia J. Wagner (), \\ Andrew H. Song (), \\ or Richard J. Chen (), ## Acknowledgements The project was built on top of amazing repositories such as ViT, iBOT, OpenClip, LGSSL, and Timm (ViT model implementation). We thank the authors and developers for their contribution. ## BibTeX If you found our work useful in your research, please consider citing our work at: Ding, T.\\*, Wagner S.J.\\*, Song, A.H.\\*, Chen, R.J.\\* et al. Multimodal Whole Slide Foundation Model for Pathology, Arxiv, 2024",
21
+ "model_explanation_gemini": "TITAN is a multimodal foundation model for pathology, combining visual self-supervised learning and vision-language alignment to analyze whole-slide images and generate pathology reports, achieving state-of-the-art performance in tasks like classification, retrieval, and report generation.\n\nFeatures: \n- Multimodal (image + text) pathology model \n- Pretrained on 335,645 WSIs and 182,000+ reports \n- Self-supervised ViT architecture \n- Supports zero-shot/few-shot learning \n-"
22
+ }
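TITAN is gated and ships custom modeling code, so access must be approved and remote code trusted before loading. A heavily hedged sketch that stops at authentication and model loading; the slide-level feature-extraction call itself is documented in the model's own repo and is not reproduced here.

```python
from huggingface_hub import login
from transformers import AutoModel

# Requires an approved access request and a Hugging Face token.
login()

# trust_remote_code is needed because the repo provides its own model class.
titan = AutoModel.from_pretrained("MahmoodLab/TITAN", trust_remote_code=True)
titan.eval()
```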
model_data_json/MoritzLaurer_bge-m3-zeroshot-v2.0.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
3
+ "downloads": 70245,
4
+ "tags": [
5
+ "transformers",
6
+ "onnx",
7
+ "safetensors",
8
+ "xlm-roberta",
9
+ "text-classification",
10
+ "zero-shot-classification",
11
+ "multilingual",
12
+ "arxiv:2312.17543",
13
+ "base_model:BAAI/bge-m3-retromae",
14
+ "base_model:quantized:BAAI/bge-m3-retromae",
15
+ "license:mit",
16
+ "autotrain_compatible",
17
+ "endpoints_compatible",
18
+ "region:us"
19
+ ],
20
+ "description": "--- language: - multilingual tags: - text-classification - zero-shot-classification base_model: BAAI/bge-m3-retromae pipeline_tag: zero-shot-classification library_name: transformers license: mit --- # Model description: bge-m3-zeroshot-v2.0 ## zeroshot-v2.0 series of models Models in this series are designed for efficient zeroshot classification with the Hugging Face pipeline. These models can do classification without training data and run on both GPUs and CPUs. An overview of the latest zeroshot classifiers is available in my Zeroshot Classifier Collection. The main update of this series of models is that several models are trained on fully commercially-friendly data for users with strict license requirements. These models can do one universal classification task: determine whether a hypothesis is \"true\" or \"not true\" given a text ( vs. ). This task format is based on the Natural Language Inference task (NLI). The task is so universal that any classification task can be reformulated into this task by the Hugging Face pipeline. ## Training data Models with a \"\" in the name are trained on two types of fully commercially-friendly data: 1. Synthetic data generated with Mixtral-8x7B-Instruct-v0.1. I first created a list of 500+ diverse text classification tasks for 25 professions in conversations with Mistral-large. The data was manually curated. I then used this as seed data to generate several hundred thousand texts for these tasks with Mixtral-8x7B-Instruct-v0.1. The final dataset used is available in the synthetic_zeroshot_mixtral_v0.1 dataset in the subset . Data curation was done in multiple iterations and will be improved in future iterations. 2. Two commercially-friendly NLI datasets: (MNLI, FEVER-NLI). These datasets were added to increase generalization. 3. Models without a \"\" in the name also included a broader mix of training data with a broader mix of licenses: ANLI, WANLI, LingNLI, and all datasets in this list where . ## How to use the models forces the model to decide on only one class. enables the model to choose multiple classes. ## Metrics The models were evaluated on 28 different text classification tasks with the f1_macro metric. The main reference point is which is, at the time of writing (03.04.24), the most used commercially-friendly 0-shot classifier. 
!results_aggreg_v2.0 | | facebook/bart-large-mnli | roberta-base-zeroshot-v2.0-c | roberta-large-zeroshot-v2.0-c | deberta-v3-base-zeroshot-v2.0-c | deberta-v3-base-zeroshot-v2.0 (fewshot) | deberta-v3-large-zeroshot-v2.0-c | deberta-v3-large-zeroshot-v2.0 (fewshot) | bge-m3-zeroshot-v2.0-c | bge-m3-zeroshot-v2.0 (fewshot) | |:---------------------------|---------------------------:|-----------------------------:|------------------------------:|--------------------------------:|-----------------------------------:|---------------------------------:|------------------------------------:|-----------------------:|--------------------------:| | all datasets mean | 0.497 | 0.587 | 0.622 | 0.619 | 0.643 (0.834) | 0.676 | 0.673 (0.846) | 0.59 | (0.803) | | amazonpolarity (2) | 0.937 | 0.924 | 0.951 | 0.937 | 0.943 (0.961) | 0.952 | 0.956 (0.968) | 0.942 | (0.951) | | imdb (2) | 0.892 | 0.871 | 0.904 | 0.893 | 0.899 (0.936) | 0.923 | 0.918 (0.958) | 0.873 | (0.917) | | appreviews (2) | 0.934 | 0.913 | 0.937 | 0.938 | 0.945 (0.948) | 0.943 | 0.949 (0.962) | 0.932 | (0.954) | | yelpreviews (2) | 0.948 | 0.953 | 0.977 | 0.979 | 0.975 (0.989) | 0.988 | 0.985 (0.994) | 0.973 | (0.978) | | rottentomatoes (2) | 0.83 | 0.802 | 0.841 | 0.84 | 0.86 (0.902) | 0.869 | 0.868 (0.908) | 0.813 | (0.866) | | emotiondair (6) | 0.455 | 0.482 | 0.486 | 0.459 | 0.495 (0.748) | 0.499 | 0.484 (0.688) | 0.453 | (0.697) | | emocontext (4) | 0.497 | 0.555 | 0.63 | 0.59 | 0.592 (0.799) | 0.699 | 0.676 (0.81) | 0.61 | (0.798) | | empathetic (32) | 0.371 | 0.374 | 0.404 | 0.378 | 0.405 (0.53) | 0.447 | 0.478 (0.555) | 0.387 | (0.455) | | financialphrasebank (3) | 0.465 | 0.562 | 0.455 | 0.714 | 0.669 (0.906) | 0.691 | 0.582 (0.913) | 0.504 | (0.895) | | banking77 (72) | 0.312 | 0.124 | 0.29 | 0.421 | 0.446 (0.751) | 0.513 | 0.567 (0.766) | 0.387 | (0.715) | | massive (59) | 0.43 | 0.428 | 0.543 | 0.512 | 0.52 (0.755) | 0.526 | 0.518 (0.789) | 0.414 | (0.692) | | wikitoxic_toxicaggreg (2) | 0.547 | 0.751 | 0.766 | 0.751 | 0.769 (0.904) | 0.741 | 0.787 (0.911) | 0.736 | (0.9) | | wikitoxic_obscene (2) | 0.713 | 0.817 | 0.854 | 0.853 | 0.869 (0.922) | 0.883 | 0.893 (0.933) | 0.783 | (0.914) | | wikitoxic_threat (2) | 0.295 | 0.71 | 0.817 | 0.813 | 0.87 (0.946) | 0.827 | 0.879 (0.952) | 0.68 | (0.947) | | wikitoxic_insult (2) | 0.372 | 0.724 | 0.798 | 0.759 | 0.811 (0.912) | 0.77 | 0.779 (0.924) | 0.783 | (0.915) | | wikitoxic_identityhate (2) | 0.473 | 0.774 | 0.798 | 0.774 | 0.765 (0.938) | 0.797 | 0.806 (0.948) | 0.761 | (0.931) | | hateoffensive (3) | 0.161 | 0.352 | 0.29 | 0.315 | 0.371 (0.862) | 0.47 | 0.461 (0.847) | 0.291 | (0.823) | | hatexplain (3) | 0.239 | 0.396 | 0.314 | 0.376 | 0.369 (0.765) | 0.378 | 0.389 (0.764) | 0.29 | (0.729) | | biasframes_offensive (2) | 0.336 | 0.571 | 0.583 | 0.544 | 0.601 (0.867) | 0.644 | 0.656 (0.883) | 0.541 | (0.855) | | biasframes_sex (2) | 0.263 | 0.617 | 0.835 | 0.741 | 0.809 (0.922) | 0.846 | 0.815 (0.946) | 0.748 | (0.905) | | biasframes_intent (2) | 0.616 | 0.531 | 0.635 | 0.554 | 0.61 (0.881) | 0.696 | 0.687 (0.891) | 0.467 | (0.868) | | agnews (4) | 0.703 | 0.758 | 0.745 | 0.68 | 0.742 (0.898) | 0.819 | 0.771 (0.898) | 0.687 | (0.892) | | yahootopics (10) | 0.299 | 0.543 | 0.62 | 0.578 | 0.564 (0.722) | 0.621 | 0.613 (0.738) | 0.587 | (0.711) | | trueteacher (2) | 0.491 | 0.469 | 0.402 | 0.431 | 0.479 (0.82) | 0.459 | 0.538 (0.846) | 0.471 | (0.518) | | spam (2) | 0.505 | 0.528 | 0.504 | 0.507 | 0.464 (0.973) | 0.74 | 0.597 (0.983) | 0.441 | (0.978) | | wellformedquery (2) | 
0.407 | 0.333 | 0.333 | 0.335 | 0.491 (0.769) | 0.334 | 0.429 (0.815) | 0.361 | (0.718) | | manifesto (56) | 0.084 | 0.102 | 0.182 | 0.17 | 0.187 (0.376) | 0.258 | 0.256 (0.408) | 0.147 | (0.331) | | capsotu (21) | 0.34 | 0.479 | 0.523 | 0.502 | 0.477 (0.664) | 0.603 | 0.502 (0.686) | 0.472 | (0.644) | These numbers indicate zeroshot performance, as no data from these datasets was added in the training mix. Note that models without a \"\" in the title were evaluated twice: one run without any data from these 28 datasets to test pure zeroshot performance (the first number in the respective column) and the final run including up to 500 training data points per class from each of the 28 datasets (the second number in brackets in the column, \"fewshot\"). No model was trained on test data. Details on the different datasets are available here: ## When to use which model - **deberta-v3-zeroshot vs. roberta-zeroshot**: deberta-v3 performs clearly better than roberta, but it is a bit slower. roberta is directly compatible with Hugging Face's production inference TEI containers and flash attention. These containers are a good choice for production use-cases. tl;dr: For accuracy, use a deberta-v3 model. If production inference speed is a concern, you can consider a roberta model (e.g. in a TEI container and HF Inference Endpoints). - **commercial use-cases**: models with \"\" in the title are guaranteed to be trained on only commercially-friendly data. Models without a \"\" were trained on more data and perform better, but include data with non-commercial licenses. Legal opinions diverge if this training data affects the license of the trained model. For users with strict legal requirements, the models with \"\" in the title are recommended. - **Multilingual/non-English use-cases**: use bge-m3-zeroshot-v2.0 or bge-m3-zeroshot-v2.0-c. Note that multilingual models perform worse than English-only models. You can therefore also first machine translate your texts to English with libraries like EasyNMT and then apply any English-only model to the translated data. Machine translation also facilitates validation in case your team does not speak all languages in the data. - **context window**: The models can process up to 8192 tokens. The other models can process up to 512. Note that longer text inputs both make the mode slower and decrease performance, so if you're only working with texts of up to 400~ words / 1 page, use e.g. a deberta model for better performance. - The latest updates on new models are always available in the Zeroshot Classifier Collection. ## Reproduction Reproduction code is available in the directory here: ## Limitations and bias The model can only do text classification tasks. Biases can come from the underlying foundation model, the human NLI training data and the synthetic data generated by Mixtral. ## License The foundation model was published under the MIT license. The licenses of the training data vary depending on the model, see above. ## Citation This model is an extension of the research described in this paper. If you use this model academically, please cite: ### Ideas for cooperation or questions? If you have questions or ideas for cooperation, contact me at moritz{at}huggingface{dot}co or LinkedIn ### Flexible usage and \"prompting\" You can formulate your own hypotheses by changing the of the zeroshot pipeline. Similar to \"prompt engineering\" for LLMs, you can test different formulations of your and verbalized classes to improve performance.",
21
+ "model_explanation_gemini": "A multilingual zero-shot text classification model trained on commercially-friendly data to determine if a hypothesis is \"true\" or \"not true\" for a given text, reformulating any classification task into a Natural Language Inference (NLI) format. \n\n**Features:** \n- Zero-shot classification (no training data required) \n- Supports GPU and CPU inference \n- Commercially-friendly training data (synthetic Mixtral-generated data + MNLI/FEVER-NLI) \n- Handles single-label"
22
+ }
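The zeroshot-v2.0 card above explains that any classification task can be recast as an NLI-style "true / not true" decision and that the hypothesis template can be tuned like a prompt. A minimal sketch with an illustrative non-English input, since this is the multilingual variant:

```python
from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/bge-m3-zeroshot-v2.0",
)

text = "La batería del teléfono se agota demasiado rápido."  # illustrative Spanish input
labels = ["battery", "camera", "screen", "price"]
hypothesis_template = "This text is about {}"

# multi_label=False forces a single class; set it to True to allow several labels.
result = classifier(text, labels, hypothesis_template=hypothesis_template, multi_label=False)
print(result["labels"][0], round(result["scores"][0], 3))
```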
model_data_json/NbAiLab_nb-wav2vec2-300m-nynorsk.json ADDED
1
+ {
2
+ "model_id": "NbAiLab/nb-wav2vec2-300m-nynorsk",
3
+ "downloads": 73025,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "tensorboard",
8
+ "safetensors",
9
+ "wav2vec2",
10
+ "automatic-speech-recognition",
11
+ "nn",
12
+ "dataset:NbAiLab/NPSC",
13
+ "arxiv:2307.01672",
14
+ "license:apache-2.0",
15
+ "model-index",
16
+ "endpoints_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: apache-2.0 tags: - automatic-speech-recognition datasets: - NbAiLab/NPSC language: - nn model-index: - name: nb-wav2vec2-300m-nynorsk results: - task: name: Automatic Speech Recognition type: automatic-speech-recognition dataset: name: NPSC type: NbAiLab/NPSC args: 16K_mp3_nynorsk metrics: - name: Test (Nynorsk) WER type: wer value: 0.1222 - name: Test (Nynorsk) CER type: cer value: 0.0419 --- # Norwegian Wav2Vec2 Model - 300M - VoxRex - Nynorsk This model is finetuned on top of feature extractor VoxRex-model from the National Library of Sweden. The finetuned model achieves the following results on the test set with a 5-gram KenLM. The numbers in parentheses are the results without the language model: - **WER: 0.1222** (0.1537) - **CER: 0.0419** (0.0468) ## Model description This is one of several Wav2Vec-models our team created during the 🤗 hosted Robust Speech Event. This is the complete list of our models and their final scores: | Model | Final WER | | |:--------------|:------------|:------------:| | NbAiLab/nb-wav2vec2-1b-bokmaal | 6.33 | | | NbAiLab/nb-wav2vec2-300m-bokmaal | 7.03 | | | NbAiLab/nb-wav2vec2-1b-nynorsk | 11.32 | | | NbAiLab/nb-wav2vec2-300m-nynorsk (this model) | 12.22 | | ### Dataset In parallel with the event, the team also converted the Norwegian Parliamentary Speech Corpus (NPSC) to the NbAiLab/NPSC in 🤗 Dataset format and used that as the main source for training. ## Code We have released all the code developed during the event so that the Norwegian NLP community can build upon it when developing even better Norwegian ASR models. The finetuning of these models is not very computationally demanding. After following the instructions here, you should be able to train your own automatic speech recognition system in less than a day with an average GPU. ## Team The following people contributed to building this model: Rolv-Arild Braaten, Per Egil Kummervold, Andre Kåsen, Javier de la Rosa, Per Erik Solberg, and Freddy Wetjen. ## Training procedure To reproduce these results, we strongly recommend that you follow the instructions from 🤗 to train a simple Swedish model. When you have verified that you are able to do this, create a fresh new repo. You can then start by copying the files and from our repo. Running these will create all the other necessary files, and should let you reproduce our results. With some tweaks to the hyperparameters, you might even be able to build an even better ASR. Good luck! ### Language Model As the scores indicate, adding even a simple 5-gram language will improve the results. 🤗 has provided another very nice blog explaining how to add a 5-gram language model to improve the ASR model. You can build this from your own corpus, for instance by extracting some suitable text from the Norwegian Colossal Corpus. You can also skip some of the steps in the guide, and copy the 5-gram model from this repo. ### Parameters The final model was run using these parameters: Using these settings, the training might take 3-4 days on an average GPU. You can, however, get a decent model and faster results by tweaking these parameters. | Parameter| Comment | |:-------------|:-----| | per_device_train_batch_size | Adjust this to the maximum of available memory. 16 or 24 might be good settings depending on your system | |gradient_accumulation_steps |Can be adjusted even further up to increase batch size and speed up training without running into memory issues | | learning_rate|Can be increased, maybe as high as 1e-4. 
Speeds up training but might add instability | | epochs| Can be decreased significantly. This is a huge dataset and you might get a decent result already after a couple of epochs| ## Citation See",
20
+ "model_explanation_gemini": "Finetuned for automatic speech recognition in Nynorsk, this model achieves a 12.22% WER on the NPSC dataset, enhanced by a 5-gram KenLM.\n\nFeatures: \n- **Task:** Automatic Speech Recognition (ASR) \n- **Base Model:** VoxRex from the National Library of Sweden \n- **Dataset:** NbAiLab/NPSC (Norwegian Parliamentary Speech Corpus, Nynorsk subset) \n- **Metrics:** \n - W"
21
+ }
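The card above evaluates the Nynorsk wav2vec2 model with and without a 5-gram KenLM. A minimal transcription sketch; `nynorsk_sample.wav` is a placeholder, and the bundled language model is only applied when pyctcdecode and kenlm are installed.

```python
from transformers import pipeline

# CTC-based ASR; decoding may fall back to plain argmax without pyctcdecode/kenlm.
asr = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-wav2vec2-300m-nynorsk",
)

print(asr("nynorsk_sample.wav")["text"])  # placeholder audio path
```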
model_data_json/OpenGVLab_InternVL2_5-38B-MPO.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO",
3
+ "downloads": 78877,
4
+ "tags": [
5
+ "transformers",
6
+ "tensorboard",
7
+ "safetensors",
8
+ "internvl_chat",
9
+ "feature-extraction",
10
+ "internvl",
11
+ "custom_code",
12
+ "image-text-to-text",
13
+ "conversational",
14
+ "multilingual",
15
+ "dataset:OpenGVLab/MMPR-v1.1",
16
+ "arxiv:2312.14238",
17
+ "arxiv:2404.16821",
18
+ "arxiv:2412.05271",
19
+ "arxiv:2411.10442",
20
+ "base_model:OpenGVLab/InternVL2_5-38B",
21
+ "base_model:finetune:OpenGVLab/InternVL2_5-38B",
22
+ "license:mit",
23
+ "region:us"
24
+ ],
25
+ "description": "--- license: mit pipeline_tag: image-text-to-text library_name: transformers base_model: - OpenGVLab/InternVL2_5-38B base_model_relation: finetune datasets: - OpenGVLab/MMPR-v1.1 language: - multilingual tags: - internvl - custom_code --- # InternVL2_5-38B-MPO [\\[📂 GitHub\\]]( [\\[📜 InternVL 1.0\\]]( [\\[📜 InternVL 1.5\\]]( [\\[📜 InternVL 2.5\\]]( [\\[📜 InternVL2.5-MPO\\]]( [\\[🆕 Blog\\]]( [\\[🗨️ Chat Demo\\]]( [\\[🤗 HF Demo\\]]( [\\[🚀 Quick Start\\]](#quick-start) [\\[📖 Documents\\]]( <div align=\"center\"> <img width=\"500\" alt=\"image\" src=\" </div> ## Introduction We introduce InternVL2.5-MPO, an advanced multimodal large language model (MLLM) series that demonstrates superior overall performance. This series builds upon InternVL2.5 and Mixed Preference Optimization. !image/png ## InternVL 2.5 Family In the following table, we provide an overview of the InternVL2.5-MPO series. | Model Name | Vision Part | Language Part | HF Link | | :-----------------: | :-------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | :------------------------------------------------------------: | | InternVL2_5-1B-MPO | InternViT-300M-448px-V2_5 | Qwen2.5-0.5B-Instruct | 🤗 link | | InternVL2_5-2B-MPO | InternViT-300M-448px-V2_5 | internlm2_5-1_8b-chat | 🤗 link | | InternVL2_5-4B-MPO | InternViT-300M-448px-V2_5 | Qwen2.5-3B-Instruct | 🤗 link | | InternVL2_5-8B-MPO | InternViT-300M-448px-V2_5 | internlm2_5-7b-chat | 🤗 link | | InternVL2_5-26B-MPO | InternViT-6B-448px-V2_5 | internlm2_5-20b-chat | 🤗 link | | InternVL2_5-38B-MPO | InternViT-6B-448px-V2_5 | Qwen2.5-32B-Instruct | 🤗 link | | InternVL2_5-78B-MPO | InternViT-6B-448px-V2_5 | Qwen2.5-72B-Instruct | 🤗 link | ## Model Architecture As shown in the following figure, InternVL2.5-MPO retains the same model architecture as InternVL 2.5 and its predecessors, InternVL 1.5 and 2.0, following the \"ViT-MLP-LLM\" paradigm. In this new version, we integrate a newly incrementally pre-trained InternViT with various pre-trained LLMs, including InternLM 2.5 and Qwen 2.5, using a randomly initialized MLP projector. !image/png As in the previous version, we applied a pixel unshuffle operation, reducing the number of visual tokens to one-quarter of the original. Besides, we adopted a similar dynamic resolution strategy as InternVL 1.5, dividing images into tiles of 448×448 pixels. The key difference, starting from InternVL 2.0, is that we additionally introduced support for multi-image and video data. ## Key Designs ### Multi-Modal Preference Dataset MMPR is a large-scale and high-quality multimodal reasoning preference dataset. This dataset includes about 3 million samples. !image/jpeg !image/jpeg To construct this dataset, we propose an efficient data construction pipeline. Specifically, we categorize the multimodal data into **samples with clear ground truths** and **samples without clear ground truths**. - **For samples with clear ground truths:** the model is prompted to first provide the reasoning process and then give the final answer in the format like . Responses matching the ground truth answer constitute the positive set \\\\(\\mathcal{Y}_p\\\\), while those that do not match make up the negative set \\\\(\\mathcal{Y}_n\\\\). Additionally, responses that fail to provide a clear final answer are also merged into \\\\(\\mathcal{Y}_n\\\\). 
Given these responses labeled as positive or negative, we build the preference pairs by selecting a chosen response \\\\(y_c\\\\) from \\\\(\\mathcal{Y}_p\\\\) and a negative response \\\\(y_r\\\\) from \\\\(\\mathcal{Y}_n\\\\). - **For samples without clear ground truths:** we propose a simple yet effective method: Dropout Next-Token Prediction (Dropout NTP). Specifically, we use the responses generated by InternVL2-8B as chosen answers. Given the chosen answer, we truncate it by half and then prompt InternVL2-8B to complete the remaining portion of the truncated answer without access to the image input. This generated completion serves as the rejected answer for the paired sample. It is worth noting that while the responses generated by InternVL2-8B may not be perfect, the completions generated without the image input will introduce more hallucinations than those generated with the image input. Therefore, the partial order relationship between the chosen and rejected responses holds true. The data construction pipeline is open-sourced, see more details in our document. ### Mixed Preference Optimization The key insight behind MPO is that *an effective PO process should enable the model to learn the relative preference between pairs of responses, the absolute quality of individual responses, and the process for generating preferred responses.* We define the training objective as a combination of preference loss \\\\(\\mathcal{L}_{\\text{p}}\\\\), quality loss \\\\(\\mathcal{L}_{\\text{q}}\\\\), and generation loss \\\\(\\mathcal{L}_{\\text{g}}\\\\), referred to as Mixed Preference Optimization: $$ \\mathcal{L}=w_{p}\\cdot\\mathcal{L}_{\\text{p}} + w_{q}\\cdot\\mathcal{L}_{\\text{q}} + w_{g}\\cdot\\mathcal{L}_{\\text{g}}, $$ where \\\\(w_{*}\\\\) represents the weight assigned to each loss component. In this work, we empirically compare different variants of preference loss. Based on the experimental results, we use DPO as our preference loss and BCO as our quality loss. Specifically, the DPO serves as the preference loss to enable the model to learn the relative preference between chosen and rejected responses. This algorithm optimizes the following loss function: $$ \\mathcal{L}_{\\text{p}}=-\\log \\sigma\\left(\\beta \\log \\frac{\\pi_\\theta\\left(y_c \\mid x\\right)}{\\pi_0\\left(y_c \\mid x\\right)}-\\beta \\log \\frac{\\pi_\\theta\\left(y_r \\mid x\\right)}{\\pi_0\\left(y_r \\mid x\\right)}\\right), $$ where \\\\(\\beta\\\\) is the KL penalty coefficient, and \\\\(x\\\\), \\\\(y_c\\\\), and \\\\(y_r\\\\) are user query, chosen response, and rejected response, respectively. The policy model \\\\(\\pi_\\theta\\\\) is initialized from model \\\\(\\pi_0\\\\). Additionally, the BCO loss is employed as the quality loss, which helps the model to understand the absolute quality of individual responses. The loss function is defined as: $$ \\mathcal{L}_{\\text{q}}=\\mathcal{L}_{\\text{q}}^+ + \\mathcal{L}_{\\text{q}}^-, $$ where \\\\(\\mathcal{L}_{\\text{q}}^{+}\\\\) and \\\\(\\mathcal{L}_{\\text{q}}^{+}\\\\) represent the loss for chosen and rejected responses, respectively. Each response type's loss is calculated independently, requiring the model to differentiate the absolute quality of individual responses. 
The loss terms are given by: $$ \\mathcal{L}_{\\text{q}}^+=-\\log \\sigma\\left(\\beta \\log \\frac{\\pi_\\theta\\left(y_c \\mid x\\right)}{\\pi_0\\left(y_c \\mid x\\right)} - \\delta\\right), $$ $$ \\mathcal{L}_{\\text{q}}^-=-\\log \\sigma\\left(-\\left(\\beta \\log \\frac{\\pi_\\theta\\left(y_r \\mid x\\right)}{\\pi_0\\left(y_r \\mid x\\right)} - \\delta\\right) \\right), $$ where \\\\(\\delta\\\\) represents the reward shift, calculated as the moving average of previous rewards to stabilize training. Finally, the SFT loss is used as the generation loss to help the model learn the generation process of preferred responses. The loss function is defined as: $$ \\mathcal{L}_{\\text{gen}}=-\\frac{\\log\\pi_\\theta\\left(y_c \\mid x\\right)}{\\left| y_c \\right|}. $$ ## Evaluation on Multimodal Capability To comprehensively compare InternVL's performance before and after MPO, we employ the benchmarks from OpenCompass Learderboard, including both well-established classic datasets and newly introduced ones. These benchmarks span a wide range of categories, aiming to provide a thorough and balanced assessment of InternVL’s capabilities across various multimodal tasks. We provide the evaluation results in the tables behind. | Model | Avg. | MMBench v1.1 | MMStar | MMMU | MathVista | HallusionBench | AI2D | OCRBench | MMVet | | ------------------- | ---- | ------------ | ------ | ---- | --------- | -------------- | ---- | -------- | ----- | | InternVL2-5-1B | 54.9 | 66.5 | 51.3 | 41.2 | 47.1 | 39.4 | 69.0 | 77.4 | 47.2 | | InternVL2-5-1B-MPO | 56.4 | 67.2 | 49.7 | 40.8 | 53.0 | 40.0 | 69.4 | 83.6 | 47.2 | | InternVL2-5-2B | 59.9 | 70.9 | 54.3 | 43.2 | 51.1 | 42.3 | 74.9 | 80.2 | 62.6 | | InternVL2-5-2B-MPO | 62.0 | 71.6 | 55.0 | 45.0 | 56.4 | 43.0 | 75.3 | 84.2 | 65.4 | | InternVL2-5-4B | 65.1 | 78.2 | 58.7 | 51.8 | 60.8 | 46.6 | 81.4 | 82.0 | 61.5 | | InternVL2-5-4B-MPO | 67.6 | 78.6 | 60.2 | 51.6 | 65.3 | 47.8 | 82.0 | 88.0 | 67.1 | | InternVL2-5-8B | 68.9 | 82.5 | 63.2 | 56.2 | 64.5 | 49.0 | 84.6 | 82.1 | 62.8 | | InternVL2-5-8B-MPO | 70.4 | 82.4 | 65.7 | 54.9 | 68.9 | 51.4 | 84.5 | 88.3 | 66.9 | | InternVL2-5-26B | 71.6 | 84.6 | 66.5 | 60.7 | 68.0 | 55.8 | 86.2 | 85.4 | 65.4 | | InternVL2-5-26B-MPO | 72.7 | 84.2 | 67.2 | 57.7 | 72.8 | 55.3 | 86.2 | 91.2 | 67.1 | | InternVL2-5-38B | 73.5 | 85.4 | 68.5 | 64.6 | 72.4 | 57.9 | 87.6 | 84.1 | 67.2 | | InternVL2-5-38B-MPO | 75.5 | 85.6 | 69.8 | 64.1 | 73.8 | 61.5 | 88.1 | 88.5 | 72.5 | | InternVL2-5-78B | 75.2 | 87.5 | 69.5 | 70.0 | 70.6 | 57.4 | 89.1 | 85.3 | 71.8 | | InternVL2-5-78B-MPO | 76.6 | 87.3 | 73.1 | 68.3 | 73.8 | 58.7 | 89.3 | 91.2 | 71.4 | ## Quick Start We provide an example code to run using . > Please use transformers>=4.37.2 to ensure the model works normally. ### Model Loading #### 16-bit (bf16 / fp16) #### BNB 8-bit Quantization #### Multiple GPUs The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors. ### Inference with Transformers #### Streaming Output Besides this method, you can also use the following code to get streamed output. ## Finetune Many repositories now support fine-tuning of the InternVL series models, including InternVL, SWIFT, XTurner, and others. Please refer to their documentation for more details on fine-tuning. 
## Deployment ### LMDeploy LMDeploy is a toolkit for compressing, deploying, and serving LLMs & VLMs. LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline. #### A 'Hello, world' Example If occurs while executing this case, please install the required dependency packages as prompted. #### Multi-images Inference When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased. #### Batch Prompts Inference Conducting inference with batch prompts is quite straightforward; just place them within a list structure: #### Multi-turn Conversation There are two ways to do multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use the method introduced above, the other is to use the interface. #### Service LMDeploy's enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below is an example of service startup: To use the OpenAI-style interface, you need to install OpenAI: Then, use the code below to make the API call: ## License This project is released under the MIT License. This project uses the pre-trained Qwen2.5-32B-Instruct as a component, which is licensed under the Apache License 2.0. ## Citation If you find this project useful in your research, please consider citing:",
26
+ "model_explanation_gemini": "**A multilingual multimodal large language model (MLLM) optimized for image-text-to-text tasks, built on InternVL2.5 with Mixed Preference Optimization for superior performance.** \n\n### **Features:** \n- **Multimodal Capability:** Processes both image and text inputs to generate text outputs. \n- **Multilingual Support:** Works across multiple languages. \n- **Advanced Architecture:** Uses a \"ViT-MLP-LLM\" paradigm with InternViT and LLMs like Qwen"
27
+ }
model_data_json/OpenGVLab_InternVL3-2B.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "model_id": "OpenGVLab/InternVL3-2B",
3
+ "downloads": 80574,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "internvl_chat",
8
+ "feature-extraction",
9
+ "internvl",
10
+ "custom_code",
11
+ "image-text-to-text",
12
+ "conversational",
13
+ "multilingual",
14
+ "dataset:OpenGVLab/MMPR-v1.2",
15
+ "arxiv:2312.14238",
16
+ "arxiv:2404.16821",
17
+ "arxiv:2412.05271",
18
+ "arxiv:2411.10442",
19
+ "arxiv:2504.10479",
20
+ "arxiv:2412.09616",
21
+ "base_model:OpenGVLab/InternVL3-2B-Instruct",
22
+ "base_model:finetune:OpenGVLab/InternVL3-2B-Instruct",
23
+ "license:apache-2.0",
24
+ "region:us"
25
+ ],
26
+ "description": "--- license: apache-2.0 license_name: qwen license_link: pipeline_tag: image-text-to-text library_name: transformers base_model: - OpenGVLab/InternVL3-2B-Instruct base_model_relation: finetune datasets: - OpenGVLab/MMPR-v1.2 language: - multilingual tags: - internvl - custom_code --- # InternVL3-2B [\\[📂 GitHub\\]]( [\\[📜 InternVL 1.0\\]]( [\\[📜 InternVL 1.5\\]]( [\\[📜 InternVL 2.5\\]]( [\\[📜 InternVL2.5-MPO\\]]( [\\[📜 InternVL3\\]]( [\\[🆕 Blog\\]]( [\\[🗨️ Chat Demo\\]]( [\\[🤗 HF Demo\\]]( [\\[🚀 Quick Start\\]](#quick-start) [\\[📖 Documents\\]]( <div align=\"center\"> <img width=\"500\" alt=\"image\" src=\" </div> ## Introduction We introduce InternVL3, an advanced multimodal large language model (MLLM) series that demonstrates superior overall performance. Compared to InternVL 2.5, InternVL3 exhibits superior multimodal perception and reasoning capabilities, while further extending its multimodal capabilities to encompass tool usage, GUI agents, industrial image analysis, 3D vision perception, and more. Additionally, we compare InternVL3 with Qwen2.5 Chat models, whose corresponding pre-trained base models are employed as the initialization of the langauge component in InternVL3. Benefitting from Native Multimodal Pre-Training, the InternVL3 series achieves even better overall text performance than the Qwen2.5 series. !image/png ## InternVL3 Family In the following table, we provide an overview of the InternVL3 series. | Model Name | Vision Part | Language Part | HF Link | | :-----------: | :-------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | :------------------------------------------------------: | | InternVL3-1B | InternViT-300M-448px-V2_5 | Qwen2.5-0.5B | 🤗 link | | InternVL3-2B | InternViT-300M-448px-V2_5 | Qwen2.5-1.5B | 🤗 link | | InternVL3-8B | InternViT-300M-448px-V2_5 | Qwen2.5-7B | 🤗 link | | InternVL3-9B | InternViT-300M-448px-V2_5 | internlm3-8b-instruct | 🤗 link | | InternVL3-14B | InternViT-300M-448px-V2_5 | Qwen2.5-14B | 🤗 link | | InternVL3-38B | InternViT-6B-448px-V2_5 | Qwen2.5-32B | 🤗 link | | InternVL3-78B | InternViT-6B-448px-V2_5 | Qwen2.5-72B | 🤗 link | !image/png ## Model Architecture As shown in the following figure, InternVL3 retains the same model architecture as InternVL 2.5 and its predecessors, InternVL 1.5 and 2.0, following the \"ViT-MLP-LLM\" paradigm. In this new version, we integrate a newly incrementally pre-trained InternViT with various pre-trained LLMs, including InternLM 3 and Qwen 2.5, using a randomly initialized MLP projector. !image/png As in the previous version, we applied a pixel unshuffle operation, reducing the number of visual tokens to one-quarter of the original. Besides, we adopted a similar dynamic resolution strategy as InternVL 1.5, dividing images into tiles of 448×448 pixels. The key difference, starting from InternVL 2.0, is that we additionally introduced support for multi-image and video data. Notably, in InternVL3, we integrate the Variable Visual Position Encoding (V2PE), which utilizes smaller, more flexible position increments for visual tokens. Benefiting from V2PE, InternVL3 exhibits better long context understanding capabilities compared to its predecessors. ## Training Strategy ### Native Multimodal Pre-Training We propose a Native Multimodal Pre-Training approach that consolidates language and vision learning into a single pre-training stage. 
In contrast to standard paradigms that first train a language-only model and subsequently adapt it to handle additional modalities, our method interleaves multimodal data (e.g., image-text, video-text, or image-text interleaved sequences) with large-scale textual corpora. This unified training scheme allows the model to learn both linguistic and multimodal representations simultaneously, ultimately enhancing its capability to handle vision-language tasks without the need for separate alignment or bridging modules. Please see our paper for more details. ### Supervised Fine-Tuning In this phase, the techniques of random JPEG compression, square loss re-weighting, and multimodal data packing proposed in InternVL2.5 are also employed in the InternVL3 series. The main advancement of the SFT phase in InternVL3 compared to InternVL2.5 lies in the use of higher-quality and more diverse training data. Specifically, we further extend training samples for tool use, 3D scene understanding, GUI operations, long context tasks, video understanding, scientific diagrams, creative writing, and multimodal reasoning. ### Mixed Preference Optimization During Pre-training and SFT, the model is trained to predict the next token conditioned on previous ground-truth tokens. However, during inference, the model predicts each token based on its own prior outputs. This discrepancy between ground-truth tokens and model-predicted tokens introduces a distribution shift, which can impair the model’s Chain-of-Thought (CoT) reasoning capabilities. To mitigate this issue, we employ MPO, which introduces additional supervision from both positive and negative samples to align the model response distribution with the ground-truth distribution, thereby improving reasoning performance. Specifically, the training objective of MPO is a combination of preference loss \\\\(\\mathcal{L}_{\\text{p}}\\\\), quality loss \\\\(\\mathcal{L}_{\\text{q}}\\\\), and generation loss \\\\(\\mathcal{L}_{\\text{g}}\\\\), which can be formulated as follows: $$ \\mathcal{L}=w_{p}\\cdot\\mathcal{L}_{\\text{p}} + w_{q}\\cdot\\mathcal{L}_{\\text{q}} + w_{g}\\cdot\\mathcal{L}_{\\text{g}}, $$ where \\\\(w_{*}\\\\) represents the weight assigned to each loss component. Please see our paper for more details about MPO. ### Test-Time Scaling Test-Time Scaling has been shown to be an effective method to enhance the reasoning abilities of LLMs and MLLMs. In this work, we use the Best-of-N evaluation strategy and employ VisualPRM-8B as the critic model to select the best response for reasoning and mathematics evaluation. ## Evaluation on Multimodal Capability ### Multimodal Reasoning and Mathematics !image/png ### OCR, Chart, and Document Understanding !image/png ### Multi-Image & Real-World Comprehension !image/png ### Comprehensive Multimodal & Hallucination Evaluation !image/png ### Visual Grounding !image/png ### Multimodal Multilingual Understanding !image/png ### Video Understanding !image/png ### GUI Grounding !image/png ### Spatial Reasoning !image/png ## Evaluation on Language Capability We compare InternVL3 with Qwen2.5 Chat models, whose corresponding pre-trained base models are employed as the initialization of the langauge component in InternVL3. Benefitting from Native Multimodal Pre-Training, the InternVL3 series achieves even better overall text performance than the Qwen2.5 series. 
Please note that the evaluation scores of Qwen2.5 series may differ from those officially reported, as we have adopted the prompt versions provided in the table across all datasets for OpenCompass evaluation. !image/png ## Ablation Study ### Native Multimodal Pre-Training We conduct experiments on the InternVL2-8B model while keeping its architecture, initialization parameters, and training data entirely unchanged. Traditionally, InternVL2-8B employs a training pipeline that begins with an MLP warmup phase for feature alignment followed by an Instruction Tuning stage. In our experiments, we substitute the conventional MLP warmup phase with a native multimodal pre-training process. This modification isolates the contribution of native multimodal pre-training to the overall multimodal capability of the model. The evaluation results in the Figure below shows that the model with native multimodal pre-training exhibits performance on most benchmarks that is comparable to the fully multi-stage-trained InternVL2-8B baseline. Furthermore, when followed by instruction tuning on higher-quality data, the model demonstrates further performance gains across evaluated multimodal tasks. These findings underscore the efficiency of native multimodal pre-training in imparting powerful multimodal capabilities to MLLMs. !image/png ### Mixed Preference Optimization As shown in the table below, models fine-tuned with MPO demonstrate superior reasoning performance across seven multimodal reasoning benchmarks compared to their counterparts without MPO. Specifically, InternVL3-78B and InternVL3-38B outperform their counterparts by 4.1 and 4.5 points, respectively. Notably, the training data used for MPO is a subset of that used for SFT, indicating that the performance improvements primarily stem from the training algorithm rather than the training data. !image/png ### Variable Visual Position Encoding As reported in the table below, the introduction of V2PE leads to significant performance gains across most evaluation metrics. In addition, our ablation studies—by varying the positional increment \\\\( \\delta \\\\)—reveal that even for tasks primarily involving conventional contexts, relatively small \\\\( \\delta \\\\) values can achieve optimal performance. These findings provide important insights for future efforts aimed at refining position encoding strategies for visual tokens in MLLMs. !image/png ## Quick Start We provide an example code to run using . > Please use transformers>=4.37.2 to ensure the model works normally. ### Model Loading #### 16-bit (bf16 / fp16) #### BNB 8-bit Quantization #### Multiple GPUs The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors. ### Inference with Transformers #### Streaming Output Besides this method, you can also use the following code to get streamed output. ## Finetune Many repositories now support fine-tuning of the InternVL series models, including InternVL, SWIFT, XTurner, and others. Please refer to their documentation for more details on fine-tuning. ## Deployment ### LMDeploy LMDeploy is a toolkit for compressing, deploying, and serving LLMs & VLMs. LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline. 
#### A 'Hello, world' Example If occurs while executing this case, please install the required dependency packages as prompted. #### Multi-images Inference When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased. #### Batch Prompts Inference Conducting inference with batch prompts is quite straightforward; just place them within a list structure: #### Multi-turn Conversation There are two ways to do multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use the method introduced above, the other is to use the interface. #### Service LMDeploy's enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below is an example of service startup: To use the OpenAI-style interface, you need to install OpenAI: Then, use the code below to make the API call: ## License This project is released under the MIT License. This project uses the pre-trained Qwen2.5 as a component, which is licensed under the Apache-2.0 License. ## Citation If you find this project useful in your research, please consider citing:",
27
+ "model_explanation_gemini": "\"InternVL3-2B is a multilingual multimodal large language model (MLLM) with enhanced multimodal perception, reasoning, and extended capabilities like tool usage, GUI agents, and industrial image analysis, outperforming its predecessor InternVL 2.5 and Qwen2.5 Chat models in text performance.\"\n\n**Features:** \n- Multimodal (image-text-to-text) \n- Multilingual support \n- Superior multimodal perception/reasoning \n- Extended capabilities (tool usage, GUI agents"
28
+ }
model_data_json/QuantFactory_Meta-Llama-3.1-8B-Instruct-GGUF.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "model_id": "QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
3
+ "downloads": 77242,
4
+ "tags": [
5
+ "gguf",
6
+ "facebook",
7
+ "meta",
8
+ "pytorch",
9
+ "llama",
10
+ "llama-3",
11
+ "text-generation",
12
+ "en",
13
+ "de",
14
+ "fr",
15
+ "it",
16
+ "pt",
17
+ "hi",
18
+ "es",
19
+ "th",
20
+ "arxiv:2204.05149",
21
+ "license:llama3.1",
22
+ "endpoints_compatible",
23
+ "region:us",
24
+ "conversational"
25
+ ],
26
+ "description": "--- language: - en - de - fr - it - pt - hi - es - th pipeline_tag: text-generation tags: - facebook - meta - pytorch - llama - llama-3 license: llama3.1 extra_gated_prompt: >- ### LLAMA 3.1 COMMUNITY LICENSE AGREEMENT Llama 3.1 Version Release Date: July 23, 2024 \"Agreement\" means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. \"Documentation\" means the specifications, manuals and documentation accompanying Llama 3.1 distributed by Meta at \"Licensee\" or \"you\" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. \"Llama 3.1\" means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at \"Llama Materials\" means, collectively, Meta’s proprietary Llama 3.1 and Documentation (and any portion thereof) made available under this Agreement. \"Meta\" or \"we\" means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland). 1. License Rights and Redistribution. a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials. b. Redistribution and Use. i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service (including another AI model) that contains any of them, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Llama” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials or any outputs or results of the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama” at the beginning of any such AI model name. ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you. iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Llama 3.1 is licensed under the Llama 3.1 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.” iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at which is hereby incorporated by reference into this Agreement. 2. Additional Commercial Terms. 
If, on the Llama 3.1 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights. 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS. 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. 5. Intellectual Property. a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at ). All goodwill arising out of your use of the Mark will inure to the benefit of Meta. b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications. c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Llama 3.1 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials. 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. 
Sections 3, 4 and 7 shall survive the termination of this Agreement. 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement. ### Llama 3.1 Acceptable Use Policy Meta is committed to promoting safe and fair use of its tools and features, including Llama 3.1. If you access or use Llama 3.1, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at #### Prohibited Uses We want everyone to use Llama 3.1 safely and responsibly. You agree you will not use, or allow others to use, Llama 3.1 to: 1. Violate the law or others’ rights, including to: 1. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as: 1. Violence or terrorism 2. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material 3. Human trafficking, exploitation, and sexual violence 4. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials. 5. Sexual solicitation 6. Any other criminal activity 3. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals 4. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services 5. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices 6. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws 7. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama Materials 8. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Llama 3.1 related to the following: 1. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State 2. Guns and illegal weapons (including weapon development) 3. Illegal drugs and regulated/controlled substances 4. Operation of critical infrastructure, transportation technologies, or heavy machinery 5. Self-harm or harm to others, including suicide, cutting, and eating disorders 6. 
Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual 3. Intentionally deceive or mislead others, including use of Llama 3.1 related to the following: 1. Generating, promoting, or furthering fraud or the creation or promotion of disinformation 2. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content 3. Generating, promoting, or further distributing spam 4. Impersonating another individual without consent, authorization, or legal right 5. Representing that the use of Llama 3.1 or outputs are human-generated 6. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement 4. Fail to appropriately disclose to end users any known dangers of your AI system Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means: * Reporting issues with the model: * Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback * Reporting bugs and security concerns: facebook.com/whitehat/info * Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3: [email protected] extra_gated_fields: First Name: text Last Name: text Date of birth: date_picker Country: country Affiliation: text Job title: type: select options: - Student - Research Graduate - AI researcher - AI developer/engineer - Reporter - Other geo: ip_location By clicking Submit below I accept the terms of the license and acknowledge that the information I provide will be collected stored processed and shared in accordance with the Meta Privacy Policy: checkbox extra_gated_description: The information you provide will be collected, stored, processed and shared in accordance with the Meta Privacy Policy. extra_gated_button_content: Submit --- is a collection of pretrained and instruction tuned generative models in 8B, 70B and 405B sizes (text in/text out). The Llama 3.1 instruction tuned text only models (8B, 70B, 405B) are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. **Model developer**: Meta **Model Architecture:** Llama 3.1 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety. <table> <tr> <td> </td> <td><strong>Training Data</strong> </td> <td><strong>Params</strong> </td> <td><strong>Input modalities</strong> </td> <td><strong>Output modalities</strong> </td> <td><strong>Context length</strong> </td> <td><strong>GQA</strong> </td> <td><strong>Token count</strong> </td> <td><strong>Knowledge cutoff</strong> </td> </tr> <tr> <td rowspan=\"3\" >Llama 3.1 (text only) </td> <td rowspan=\"3\" >A new mix of publicly available online data. 
</td> <td>8B </td> <td>Multilingual Text </td> <td>Multilingual Text and code </td> <td>128k </td> <td>Yes </td> <td rowspan=\"3\" >15T+ </td> <td rowspan=\"3\" >December 2023 </td> </tr> <tr> <td>70B </td> <td>Multilingual Text </td> <td>Multilingual Text and code </td> <td>128k </td> <td>Yes </td> </tr> <tr> <td>405B </td> <td>Multilingual Text </td> <td>Multilingual Text and code </td> <td>128k </td> <td>Yes </td> </tr> </table> **Supported languages:** English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai. **Llama 3.1 family of models**. Token counts refer to pretraining data only. All model versions use Grouped-Query Attention (GQA) for improved inference scalability. **Model Release Date:** July 23, 2024. **Status:** This is a static model trained on an offline dataset. Future versions of the tuned models will be released as we improve model safety with community feedback. **License:** A custom commercial license, the Llama 3.1 Community License, is available at: Where to send questions or comments about the model Instructions on how to provide feedback or comments on the model can be found in the model README. For more technical information about generation parameters and recipes for how to use Llama 3.1 in applications, please go here. ## Intended Use **Intended Use Cases** Llama 3.1 is intended for commercial and research use in multiple languages. Instruction tuned text only models are intended for assistant-like chat, whereas pretrained models can be adapted for a variety of natural language generation tasks. The Llama 3.1 model collection also supports the ability to leverage the outputs of its models to improve other models including synthetic data generation and distillation. The Llama 3.1 Community License allows for these use cases. **Out-of-scope** Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in any other way that is prohibited by the Acceptable Use Policy and Llama 3.1 Community License. Use in languages beyond those explicitly referenced as supported in this model card**. **<span style=\"text-decoration:underline;\">Note</span>: Llama 3.1 has been trained on a broader collection of languages than the 8 supported languages. Developers may fine-tune Llama 3.1 models for languages beyond the 8 supported languages provided they comply with the Llama 3.1 Community License and the Acceptable Use Policy and in such cases are responsible for ensuring that any uses of Llama 3.1 in additional languages is done in a safe and responsible manner. ## How to use This repository contains two versions of Meta-Llama-3.1-8B-Instruct, for use with transformers and with the original codebase. ### Use with transformers Starting with onward, you can run conversational inference using the Transformers abstraction or by leveraging the Auto classes with the function. Make sure to update your transformers installation via . Note: You can also find detailed recipes on how to use the model locally, with , assisted generations, quantised and more at []( ### Use with Please, follow the instructions in the repository To download Original checkpoints, see the example command below leveraging : ## Hardware and Software **Training Factors** We used custom training libraries, Meta's custom built GPU cluster, and production infrastructure for pretraining. Fine-tuning, annotation, and evaluation were also performed on production infrastructure. 
**Training utilized a cumulative of** 39.3M GPU hours of computation on H100-80GB (TDP of 700W) type hardware, per the table below. Training time is the total GPU time required for training each model and power consumption is the peak power capacity per GPU device used, adjusted for power usage efficiency. **Training Greenhouse Gas Emissions** Estimated total location-based greenhouse gas emissions were **11,390** tons CO2eq for training. Since 2020, Meta has maintained net zero greenhouse gas emissions in its global operations and matched 100% of its electricity use with renewable energy, therefore the total market-based greenhouse gas emissions for training were 0 tons CO2eq. <table> <tr> <td> </td> <td><strong>Training Time (GPU hours)</strong> </td> <td><strong>Training Power Consumption (W)</strong> </td> <td><strong>Training Location-Based Greenhouse Gas Emissions</strong> <p> <strong>(tons CO2eq)</strong> </td> <td><strong>Training Market-Based Greenhouse Gas Emissions</strong> <p> <strong>(tons CO2eq)</strong> </td> </tr> <tr> <td>Llama 3.1 8B </td> <td>1.46M </td> <td>700 </td> <td>420 </td> <td>0 </td> </tr> <tr> <td>Llama 3.1 70B </td> <td>7.0M </td> <td>700 </td> <td>2,040 </td> <td>0 </td> </tr> <tr> <td>Llama 3.1 405B </td> <td>30.84M </td> <td>700 </td> <td>8,930 </td> <td>0 </td> </tr> <tr> <td>Total </td> <td>39.3M <td> <ul> </ul> </td> <td>11,390 </td> <td>0 </td> </tr> </table> The methodology used to determine training energy use and greenhouse gas emissions can be found here. Since Meta is openly releasing these models, the training energy use and greenhouse gas emissions will not be incurred by others. ## Training Data **Overview:** Llama 3.1 was pretrained on ~15 trillion tokens of data from publicly available sources. The fine-tuning data includes publicly available instruction datasets, as well as over 25M synthetically generated examples. **Data Freshness:** The pretraining data has a cutoff of December 2023. ## Benchmark scores In this section, we report the results for Llama 3.1 models on standard automatic benchmarks. For all the evaluations, we use our internal evaluations library. 
### Base pretrained models <table> <tr> <td><strong>Category</strong> </td> <td><strong>Benchmark</strong> </td> <td><strong># Shots</strong> </td> <td><strong>Metric</strong> </td> <td><strong>Llama 3 8B</strong> </td> <td><strong>Llama 3.1 8B</strong> </td> <td><strong>Llama 3 70B</strong> </td> <td><strong>Llama 3.1 70B</strong> </td> <td><strong>Llama 3.1 405B</strong> </td> </tr> <tr> <td rowspan=\"7\" >General </td> <td>MMLU </td> <td>5 </td> <td>macro_avg/acc_char </td> <td>66.7 </td> <td>66.7 </td> <td>79.5 </td> <td>79.3 </td> <td>85.2 </td> </tr> <tr> <td>MMLU-Pro (CoT) </td> <td>5 </td> <td>macro_avg/acc_char </td> <td>36.2 </td> <td>37.1 </td> <td>55.0 </td> <td>53.8 </td> <td>61.6 </td> </tr> <tr> <td>AGIEval English </td> <td>3-5 </td> <td>average/acc_char </td> <td>47.1 </td> <td>47.8 </td> <td>63.0 </td> <td>64.6 </td> <td>71.6 </td> </tr> <tr> <td>CommonSenseQA </td> <td>7 </td> <td>acc_char </td> <td>72.6 </td> <td>75.0 </td> <td>83.8 </td> <td>84.1 </td> <td>85.8 </td> </tr> <tr> <td>Winogrande </td> <td>5 </td> <td>acc_char </td> <td>- </td> <td>60.5 </td> <td>- </td> <td>83.3 </td> <td>86.7 </td> </tr> <tr> <td>BIG-Bench Hard (CoT) </td> <td>3 </td> <td>average/em </td> <td>61.1 </td> <td>64.2 </td> <td>81.3 </td> <td>81.6 </td> <td>85.9 </td> </tr> <tr> <td>ARC-Challenge </td> <td>25 </td> <td>acc_char </td> <td>79.4 </td> <td>79.7 </td> <td>93.1 </td> <td>92.9 </td> <td>96.1 </td> </tr> <tr> <td>Knowledge reasoning </td> <td>TriviaQA-Wiki </td> <td>5 </td> <td>em </td> <td>78.5 </td> <td>77.6 </td> <td>89.7 </td> <td>89.8 </td> <td>91.8 </td> </tr> <tr> <td rowspan=\"4\" >Reading comprehension </td> <td>SQuAD </td> <td>1 </td> <td>em </td> <td>76.4 </td> <td>77.0 </td> <td>85.6 </td> <td>81.8 </td> <td>89.3 </td> </tr> <tr> <td>QuAC (F1) </td> <td>1 </td> <td>f1 </td> <td>44.4 </td> <td>44.9 </td> <td>51.1 </td> <td>51.1 </td> <td>53.6 </td> </tr> <tr> <td>BoolQ </td> <td>0 </td> <td>acc_char </td> <td>75.7 </td> <td>75.0 </td> <td>79.0 </td> <td>79.4 </td> <td>80.0 </td> </tr> <tr> <td>DROP (F1) </td> <td>3 </td> <td>f1 </td> <td>58.4 </td> <td>59.5 </td> <td>79.7 </td> <td>79.6 </td> <td>84.8 </td> </tr> </table> ### Instruction tuned models <table> <tr> <td><strong>Category</strong> </td> <td><strong>Benchmark</strong> </td> <td><strong># Shots</strong> </td> <td><strong>Metric</strong> </td> <td><strong>Llama 3 8B Instruct</strong> </td> <td><strong>Llama 3.1 8B Instruct</strong> </td> <td><strong>Llama 3 70B Instruct</strong> </td> <td><strong>Llama 3.1 70B Instruct</strong> </td> <td><strong>Llama 3.1 405B Instruct</strong> </td> </tr> <tr> <td rowspan=\"4\" >General </td> <td>MMLU </td> <td>5 </td> <td>macro_avg/acc </td> <td>68.5 </td> <td>69.4 </td> <td>82.0 </td> <td>83.6 </td> <td>87.3 </td> </tr> <tr> <td>MMLU (CoT) </td> <td>0 </td> <td>macro_avg/acc </td> <td>65.3 </td> <td>73.0 </td> <td>80.9 </td> <td>86.0 </td> <td>88.6 </td> </tr> <tr> <td>MMLU-Pro (CoT) </td> <td>5 </td> <td>micro_avg/acc_char </td> <td>45.5 </td> <td>48.3 </td> <td>63.4 </td> <td>66.4 </td> <td>73.3 </td> </tr> <tr> <td>IFEval </td> <td> </td> <td> </td> <td>76.8 </td> <td>80.4 </td> <td>82.9 </td> <td>87.5 </td> <td>88.6 </td> </tr> <tr> <td rowspan=\"2\" >Reasoning </td> <td>ARC-C </td> <td>0 </td> <td>acc </td> <td>82.4 </td> <td>83.4 </td> <td>94.4 </td> <td>94.8 </td> <td>96.9 </td> </tr> <tr> <td>GPQA </td> <td>0 </td> <td>em </td> <td>34.6 </td> <td>30.4 </td> <td>39.5 </td> <td>41.7 </td> <td>50.7 </td> </tr> <tr> <td rowspan=\"4\" >Code </td> <td>HumanEval </td> <td>0 
</td> <td>pass@1 </td> <td>60.4 </td> <td>72.6 </td> <td>81.7 </td> <td>80.5 </td> <td>89.0 </td> </tr> <tr> <td>MBPP ++ base version </td> <td>0 </td> <td>pass@1 </td> <td>70.6 </td> <td>72.8 </td> <td>82.5 </td> <td>86.0 </td> <td>88.6 </td> </tr> <tr> <td>Multipl-E HumanEval </td> <td>0 </td> <td>pass@1 </td> <td>- </td> <td>50.8 </td> <td>- </td> <td>65.5 </td> <td>75.2 </td> </tr> <tr> <td>Multipl-E MBPP </td> <td>0 </td> <td>pass@1 </td> <td>- </td> <td>52.4 </td> <td>- </td> <td>62.0 </td> <td>65.7 </td> </tr> <tr> <td rowspan=\"2\" >Math </td> <td>GSM-8K (CoT) </td> <td>8 </td> <td>em_maj1@1 </td> <td>80.6 </td> <td>84.5 </td> <td>93.0 </td> <td>95.1 </td> <td>96.8 </td> </tr> <tr> <td>MATH (CoT) </td> <td>0 </td> <td>final_em </td> <td>29.1 </td> <td>51.9 </td> <td>51.0 </td> <td>68.0 </td> <td>73.8 </td> </tr> <tr> <td rowspan=\"4\" >Tool Use </td> <td>API-Bank </td> <td>0 </td> <td>acc </td> <td>48.3 </td> <td>82.6 </td> <td>85.1 </td> <td>90.0 </td> <td>92.0 </td> </tr> <tr> <td>BFCL </td> <td>0 </td> <td>acc </td> <td>60.3 </td> <td>76.1 </td> <td>83.0 </td> <td>84.8 </td> <td>88.5 </td> </tr> <tr> <td>Gorilla Benchmark API Bench </td> <td>0 </td> <td>acc </td> <td>1.7 </td> <td>8.2 </td> <td>14.7 </td> <td>29.7 </td> <td>35.3 </td> </tr> <tr> <td>Nexus (0-shot) </td> <td>0 </td> <td>macro_avg/acc </td> <td>18.1 </td> <td>38.5 </td> <td>47.8 </td> <td>56.7 </td> <td>58.7 </td> </tr> <tr> <td>Multilingual </td> <td>Multilingual MGSM (CoT) </td> <td>0 </td> <td>em </td> <td>- </td> <td>68.9 </td> <td>- </td> <td>86.9 </td> <td>91.6 </td> </tr> </table> #### Multilingual benchmarks <table> <tr> <td><strong>Category</strong> </td> <td><strong>Benchmark</strong> </td> <td><strong>Language</strong> </td> <td><strong>Llama 3.1 8B</strong> </td> <td><strong>Llama 3.1 70B</strong> </td> <td><strong>Llama 3.1 405B</strong> </td> </tr> <tr> <td rowspan=\"9\" ><strong>General</strong> </td> <td rowspan=\"9\" ><strong>MMLU (5-shot, macro_avg/acc)</strong> </td> <td>Portuguese </td> <td>62.12 </td> <td>80.13 </td> <td>84.95 </td> </tr> <tr> <td>Spanish </td> <td>62.45 </td> <td>80.05 </td> <td>85.08 </td> </tr> <tr> <td>Italian </td> <td>61.63 </td> <td>80.4 </td> <td>85.04 </td> </tr> <tr> <td>German </td> <td>60.59 </td> <td>79.27 </td> <td>84.36 </td> </tr> <tr> <td>French </td> <td>62.34 </td> <td>79.82 </td> <td>84.66 </td> </tr> <tr> <td>Hindi </td> <td>50.88 </td> <td>74.52 </td> <td>80.31 </td> </tr> <tr> <td>Thai </td> <td>50.32 </td> <td>72.95 </td> <td>78.21 </td> </tr> </table> ## Responsibility & Safety As part of our Responsible release approach, we followed a three-pronged strategy to managing trust & safety risks: * Enable developers to deploy helpful, safe and flexible experiences for their target audience and for the use cases supported by Llama. * Protect developers against adversarial users aiming to exploit Llama capabilities to potentially cause harm. * Provide protections for the community to help prevent the misuse of our models. ### Responsible deployment Llama is a foundational technology designed to be used in a variety of use cases, examples on how Meta’s Llama models have been responsibly deployed can be found in our Community Stories webpage. Our approach is to build the most helpful models enabling the world to benefit from the technology power, by aligning our model safety for the generic use cases addressing a standard set of harms. 
Developers are then in the driver seat to tailor safety for their use case, defining their own policy and deploying the models with the necessary safeguards in their Llama systems. Llama 3.1 was developed following the best practices outlined in our Responsible Use Guide, you can refer to the Responsible Use Guide to learn more. #### Llama 3.1 instruct Our main objectives for conducting safety fine-tuning are to provide the research community with a valuable resource for studying the robustness of safety fine-tuning, as well as to offer developers a readily available, safe, and powerful model for various applications to reduce the developer workload to deploy safe AI systems. For more details on the safety mitigations implemented please read the Llama 3 paper. **Fine-tuning data** We employ a multi-faceted approach to data collection, combining human-generated data from our vendors with synthetic data to mitigate potential safety risks. We’ve developed many large language model (LLM)-based classifiers that enable us to thoughtfully select high-quality prompts and responses, enhancing data quality control. **Refusals and Tone** Building on the work we started with Llama 3, we put a great emphasis on model refusals to benign prompts as well as refusal tone. We included both borderline and adversarial prompts in our safety data strategy, and modified our safety data responses to follow tone guidelines. #### Llama 3.1 systems **Large language models, including Llama 3.1, are not designed to be deployed in isolation but instead should be deployed as part of an overall AI system with additional safety guardrails as required.** Developers are expected to deploy system safeguards when building agentic systems. Safeguards are key to achieve the right helpfulness-safety alignment as well as mitigating safety and security risks inherent to the system and any integration of the model or system with external tools. As part of our responsible release approach, we provide the community with safeguards that developers should deploy with Llama models or other LLMs, including Llama Guard 3, Prompt Guard and Code Shield. All our reference implementations demos contain these safeguards by default so developers can benefit from system-level safety out-of-the-box. #### New capabilities Note that this release introduces new capabilities, including a longer context window, multilingual inputs and outputs and possible integrations by developers with third party tools. Building with these new capabilities requires specific considerations in addition to the best practices that generally apply across all Generative AI use cases. **Tool-use**: Just like in standard software development, developers are responsible for the integration of the LLM with the tools and services of their choice. They should define a clear policy for their use case and assess the integrity of the third party services they use to be aware of the safety and security limitations when using this capability. Refer to the Responsible Use Guide for best practices on the safe deployment of the third party safeguards. **Multilinguality**: Llama 3.1 supports 7 languages in addition to English: French, German, Hindi, Italian, Portuguese, Spanish, and Thai. Llama may be able to output text in other languages than those that meet performance thresholds for safety and helpfulness. 
We strongly discourage developers from using this model to converse in non-supported languages without implementing finetuning and system controls in alignment with their policies and the best practices shared in the Responsible Use Guide. ### Evaluations We evaluated Llama models for common use cases as well as specific capabilities. Common use cases evaluations measure safety risks of systems for most commonly built applications including chat bot, coding assistant, tool calls. We built dedicated, adversarial evaluation datasets and evaluated systems composed of Llama models and Llama Guard 3 to filter input prompt and output response. It is important to evaluate applications in context, and we recommend building dedicated evaluation dataset for your use case. Prompt Guard and Code Shield are also available if relevant to the application. Capability evaluations measure vulnerabilities of Llama models inherent to specific capabilities, for which were crafted dedicated benchmarks including long context, multilingual, tools calls, coding or memorization. **Red teaming** For both scenarios, we conducted recurring red teaming exercises with the goal of discovering risks via adversarial prompting and we used the learnings to improve our benchmarks and safety tuning datasets. We partnered early with subject-matter experts in critical risk areas to understand the nature of these real-world harms and how such models may lead to unintended harm for society. Based on these conversations, we derived a set of adversarial goals for the red team to attempt to achieve, such as extracting harmful information or reprogramming the model to act in a potentially harmful capacity. The red team consisted of experts in cybersecurity, adversarial machine learning, responsible AI, and integrity in addition to multilingual content specialists with background in integrity issues in specific geographic markets. ### Critical and other risks We specifically focused our efforts on mitigating the following critical risk areas: **1- CBRNE (Chemical, Biological, Radiological, Nuclear, and Explosive materials) helpfulness** To assess risks related to proliferation of chemical and biological weapons, we performed uplift testing designed to assess whether use of Llama 3.1 models could meaningfully increase the capabilities of malicious actors to plan or carry out attacks using these types of weapons. **2. Child Safety** Child Safety risk assessments were conducted using a team of experts, to assess the model’s capability to produce outputs that could result in Child Safety risks and inform on any necessary and appropriate risk mitigations via fine tuning. We leveraged those expert red teaming sessions to expand the coverage of our evaluation benchmarks through Llama 3 model development. For Llama 3, we conducted new in-depth sessions using objective based methodologies to assess the model risks along multiple attack vectors including the additional languages Llama 3 is trained on. We also partnered with content specialists to perform red teaming exercises assessing potentially violating content while taking account of market specific nuances or experiences. **3. Cyber attack enablement** Our cyber attack uplift study investigated whether LLMs can enhance human capabilities in hacking tasks, both in terms of skill level and speed. Our attack automation study focused on evaluating the capabilities of LLMs when used as autonomous agents in cyber offensive operations, specifically in the context of ransomware attacks. 
This evaluation was distinct from previous studies that considered LLMs as interactive assistants. The primary objective was to assess whether these models could effectively function as independent agents in executing complex cyber-attacks without human intervention. Our study of Llama-3.1-405B’s social engineering uplift for cyber attackers was conducted to assess the effectiveness of AI models in aiding cyber threat actors in spear phishing campaigns. Please read our Llama 3.1 Cyber security whitepaper to learn more. ### Community Generative AI safety requires expertise and tooling, and we believe in the strength of the open community to accelerate its progress. We are active members of open consortiums, including the AI Alliance, Partnership on AI and MLCommons, actively contributing to safety standardization and transparency. We encourage the community to adopt taxonomies like the MLCommons Proof of Concept evaluation to facilitate collaboration and transparency on safety and content evaluations. Our Purple Llama tools are open sourced for the community to use and widely distributed across ecosystem partners including cloud service providers. We encourage community contributions to our Github repository. We also set up the Llama Impact Grants program to identify and support the most compelling applications of Meta’s Llama model for societal benefit across three categories: education, climate and open innovation. The 20 finalists from the hundreds of applications can be found here. Finally, we put in place a set of resources including an output reporting mechanism and bug bounty program to continuously improve the Llama technology with the help of the community. ## Ethical Considerations and Limitations The core values of Llama 3.1 are openness, inclusivity and helpfulness. It is meant to serve everyone, and to work for a wide range of use cases. It is thus designed to be accessible to people across many different backgrounds, experiences and perspectives. Llama 3.1 addresses users and their needs as they are, without insertion unnecessary judgment or normativity, while reflecting the understanding that even content that may appear problematic in some cases can serve valuable purposes in others. It respects the dignity and autonomy of all users, especially in terms of the values of free thought and expression that power innovation and progress. But Llama 3.1 is a new technology, and like any new technology, there are risks associated with its use. Testing conducted to date has not covered, nor could it cover, all scenarios. For these reasons, as with all LLMs, Llama 3.1’s potential outputs cannot be predicted in advance, and the model may in some instances produce inaccurate, biased or other objectionable responses to user prompts. Therefore, before deploying any applications of Llama 3.1 models, developers should perform safety testing and tuning tailored to their specific applications of the model. Please refer to available resources including our Responsible Use Guide, Trust and Safety solutions, and other resources to learn more about responsible development.",
27
+ "model_explanation_gemini": "A quantized version of Meta's Llama-3.1-8B-Instruct model optimized for efficient inference, supporting multilingual text generation.\n\n**Features:** \n- Multilingual support (English, German, French, Italian, Portuguese, Hindi, Spanish, Thai) \n- Text generation capability \n- Quantized GGUF format for efficiency \n- Derived from Meta's Llama 3.1 architecture \n- Community license with redistribution terms \n\n**Comparison:** \nThis model is a quantized variant"
28
+ }
model_data_json/Qwen_QwQ-32B-GGUF.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model_id": "Qwen/QwQ-32B-GGUF",
3
+ "downloads": 68965,
4
+ "tags": [
5
+ "gguf",
6
+ "chat",
7
+ "text-generation",
8
+ "en",
9
+ "arxiv:2309.00071",
10
+ "arxiv:2412.15115",
11
+ "base_model:Qwen/QwQ-32B",
12
+ "base_model:quantized:Qwen/QwQ-32B",
13
+ "license:apache-2.0",
14
+ "endpoints_compatible",
15
+ "region:us",
16
+ "conversational"
17
+ ],
18
+ "description": "--- license: apache-2.0 license_link: language: - en pipeline_tag: text-generation base_model: Qwen/QwQ-32B tags: - chat --- # QwQ-32B-GGUF <a href=\" target=\"_blank\" style=\"margin: 2px;\"> <img alt=\"Chat\" src=\" style=\"display: inline-block; vertical-align: middle;\"/> </a> ## Introduction QwQ is the reasoning model of the Qwen series. Compared with conventional instruction-tuned models, QwQ, which is capable of thinking and reasoning, can achieve significantly enhanced performance in downstream tasks, especially hard problems. QwQ-32B is the medium-sized reasoning model, which is capable of achieving competitive performance against state-of-the-art reasoning models, e.g., DeepSeek-R1, o1-mini. <p align=\"center\"> <img width=\"100%\" src=\"figures/benchmark.jpg\"> </p> **This repo contains the QwQ 32B model in the GGUF Format**, which has the following features: - Type: Causal Language Models - Training Stage: Pretraining & Post-training (Supervised Finetuning and Reinforcement Learning) - Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias - Number of Parameters: 32.5B - Number of Paramaters (Non-Embedding): 31.0B - Number of Layers: 64 - Number of Attention Heads (GQA): 40 for Q and 8 for KV - Context Length: Full 131,072 tokens - Quantization: q4_K_M, q5_0, q5_K_M, q6_K, q8_0 **Note:** For the best experience, please review the usage guidelines before deploying QwQ models. You can try our demo or access QwQ models via QwenChat. For more details, please refer to our blog, GitHub, and Documentation. ## Requirements QwQ is based on Qwen2.5, whose code has been in the latest Hugging face . We advise you to use the latest version of . With , you will encounter the following error: Also check out our AWQ documentation for more usage guide. ## Quickstart heck out our llama.cpp documentation for more usage guide. We advise you to clone []( and install it following the official guide. We follow the latest version of llama.cpp. In the following demonstration, we assume that you are running commands under the repository . You can use the following commands for inference: ### Usage Guidelines To achieve optimal performance, we recommend the following settings: 1. **Enforce Thoughtful Output**: Ensure the model starts with \"\\<think\\>\\n\" to prevent generating empty thinking content, which can degrade output quality. 2. **Sampling Parameters**: - Use Temperature=0.6, TopP=0.95, MinP=0 instead of Greedy decoding to avoid endless repetitions. - Use TopK between 20 and 40 to filter out rare token occurrences while maintaining the diversity of the generated output. - For supported frameworks, you can adjust the parameter between 0 and 2 to reduce endless repetitions. However, using a higher value may result in occasional language mixing and a slight decrease in performance. 3. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. This feature is already implemented in . 4. **Standardize Output Format**: We recommend using prompts to standardize model outputs when benchmarking. - **Math Problems**: Include \"Please reason step by step, and put your final answer within \\boxed{}.\" in the prompt. - **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: \"Please show your choice in the field with only the choice letter, e.g.,.\" in the prompt. 5. 
**Handle Long Inputs**: For inputs exceeding 32,768 tokens, enable YaRN to improve the model's ability to capture long-sequence information effectively. Currently, only vLLM supports YaRN for length extrapolation. If you want to process sequences up to 131,072 tokens, please refer to non-GGUF models. 6. **Other References**: You can also consult Unsloth's Guide to see if their approach meets your needs. (Thanks to the Unsloth team!) ## Evaluation & Performance Detailed evaluation results are reported in this 📑 blog. For requirements on GPU memory and the respective throughput, see results here. ## Citation If you find our work helpful, feel free to give us a cite.",
19
+ "model_explanation_gemini": "A 32.5B-parameter reasoning-focused language model optimized for enhanced performance on complex tasks through structured thinking and step-by-step problem-solving, supporting long-context (131K tokens) and GGUF quantization. \n\n**Features:** \n- **Type:** Causal language model \n- **Architecture:** Transformers with RoPE, SwiGLU, RMSNorm, Attention QKV bias \n- **Training:** Pretraining + supervised finetuning + RL \n- **Context Length:**"
20
+ }
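The entry above references llama.cpp inference commands and sampling guidelines whose code blocks were stripped during scraping. Below is a minimal, hedged sketch of the same idea using the llama-cpp-python bindings rather than the card's original shell commands; the local GGUF filename is a placeholder, the sampling values mirror the guidelines quoted in the description, `min_p` requires a reasonably recent llama-cpp-python build, and whether the reply starts with `<think>\n` (guideline 1) depends on the chat template baked into the GGUF file you download.

```python
# Sketch only: run a downloaded QwQ-32B GGUF quantization with llama-cpp-python,
# using the sampling settings recommended in the card (Temperature=0.6, TopP=0.95,
# TopK in 20-40, MinP=0). The model_path is a placeholder for the file you fetched.
from llama_cpp import Llama

llm = Llama(
    model_path="./qwq-32b-q4_k_m.gguf",  # hypothetical local path
    n_ctx=32768,                         # context window to allocate
    n_gpu_layers=-1,                     # offload all layers to GPU if memory allows
)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "How many positive divisors does 360 have?"}],
    temperature=0.6,
    top_p=0.95,
    top_k=20,
    min_p=0.0,
    max_tokens=4096,
)
print(response["choices"][0]["message"]["content"])
```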
model_data_json/Qwen_Qwen2-VL-2B.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2-VL-2B",
3
+ "downloads": 77369,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen2_vl",
8
+ "image-text-to-text",
9
+ "multimodal",
10
+ "conversational",
11
+ "en",
12
+ "arxiv:2409.12191",
13
+ "arxiv:2308.12966",
14
+ "license:apache-2.0",
15
+ "text-generation-inference",
16
+ "endpoints_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: apache-2.0 language: - en pipeline_tag: image-text-to-text tags: - multimodal library_name: transformers --- # Qwen2-VL-2B ## Introduction We're excited to unveil **Qwen2-VL**, the latest iteration of our Qwen-VL model, representing nearly a year of innovation. > [!Important] > This is the base pretrained model of Qwen2-VL-2B without instruction tuning. ### What’s New in Qwen2-VL? #### Key Enhancements: * **SoTA understanding of images of various resolution & ratio**: Qwen2-VL achieves state-of-the-art performance on visual understanding benchmarks, including MathVista, DocVQA, RealWorldQA, MTVQA, etc. * **Understanding videos of 20min+**: Qwen2-VL can understand videos over 20 minutes for high-quality video-based question answering, dialog, content creation, etc. * **Agent that can operate your mobiles, robots, etc.**: with the abilities of complex reasoning and decision making, Qwen2-VL can be integrated with devices like mobile phones, robots, etc., for automatic operation based on visual environment and text instructions. * **Multilingual Support**: to serve global users, besides English and Chinese, Qwen2-VL now supports the understanding of texts in different languages inside images, including most European languages, Japanese, Korean, Arabic, Vietnamese, etc. #### Model Architecture Updates: * **Naive Dynamic Resolution**: Unlike before, Qwen2-VL can handle arbitrary image resolutions, mapping them into a dynamic number of visual tokens, offering a more human-like visual processing experience. <p align=\"center\"> <img src=\" width=\"80%\"/> <p> * **Multimodal Rotary Position Embedding (M-ROPE)**: Decomposes positional embedding into parts to capture 1D textual, 2D visual, and 3D video positional information, enhancing its multimodal processing capabilities. <p align=\"center\"> <img src=\" width=\"80%\"/> <p> We have three models with 2, 7 and 72 billion parameters. This repo contains the **pretrained** 2B Qwen2-VL model. For more information, visit our Blog and GitHub. ## Requirements The code of Qwen2-VL has been in the latest Hugging Face and we advise you to install the latest version with command , or you might encounter the following error: ## Citation If you find our work helpful, feel free to give us a cite.",
20
+ "model_explanation_gemini": "\"Qwen2-VL-2B is a 2-billion-parameter multimodal pretrained model excelling in visual understanding (images, videos), multilingual text recognition, and device integration for tasks like question answering, content creation, and robotic control.\"\n\n**Features**: \n- State-of-the-art image understanding (variable resolution/ratio) \n- Video comprehension (20+ minutes) \n- Multilingual text recognition in images (European, Asian, Arabic languages) \n- Device integration for robotic/m"
21
+ }
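The description above notes that this is the base (non-instruct) Qwen2-VL-2B checkpoint and that the card's requirement/usage snippet did not survive scraping. A hedged sketch of completion-style inference with the `Qwen2VLForConditionalGeneration` and `AutoProcessor` classes follows; the image URL and the raw prompt format (vision placeholder tokens followed by a text prefix) are illustrative assumptions, not text from the card.

```python
# Sketch only: completion-style inference with the base Qwen2-VL-2B checkpoint.
# Assumes a transformers release that ships the qwen2_vl architecture.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B", torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B")

# Placeholder image; any RGB picture works.
image = Image.open(requests.get("https://example.com/demo.jpg", stream=True).raw).convert("RGB")

# The vision placeholder tokens are expanded by the processor into image patches.
prompt = "<|vision_start|><|image_pad|><|vision_end|>This picture shows"

inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```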
model_data_json/Qwen_Qwen2-VL-72B.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2-VL-72B",
3
+ "downloads": 77971,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen2_vl",
8
+ "image-text-to-text",
9
+ "multimodal",
10
+ "conversational",
11
+ "en",
12
+ "arxiv:2409.12191",
13
+ "arxiv:2308.12966",
14
+ "license:other",
15
+ "text-generation-inference",
16
+ "endpoints_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: other license_name: qwen license_link: language: - en pipeline_tag: image-text-to-text tags: - multimodal library_name: transformers --- # Qwen2-VL-72B ## Introduction We're excited to unveil **Qwen2-VL**, the latest iteration of our Qwen-VL model, representing nearly a year of innovation. > [!Important] > This is the base pretrained model of Qwen2-VL-72B without instruction tuning. ### What’s New in Qwen2-VL? #### Key Enhancements: * **SoTA understanding of images of various resolution & ratio**: Qwen2-VL achieves state-of-the-art performance on visual understanding benchmarks, including MathVista, DocVQA, RealWorldQA, MTVQA, etc. * **Understanding videos of 20min+**: Qwen2-VL can understand videos over 20 minutes for high-quality video-based question answering, dialog, content creation, etc. * **Agent that can operate your mobiles, robots, etc.**: with the abilities of complex reasoning and decision making, Qwen2-VL can be integrated with devices like mobile phones, robots, etc., for automatic operation based on visual environment and text instructions. * **Multilingual Support**: to serve global users, besides English and Chinese, Qwen2-VL now supports the understanding of texts in different languages inside images, including most European languages, Japanese, Korean, Arabic, Vietnamese, etc. #### Model Architecture Updates: * **Naive Dynamic Resolution**: Unlike before, Qwen2-VL can handle arbitrary image resolutions, mapping them into a dynamic number of visual tokens, offering a more human-like visual processing experience. <p align=\"center\"> <img src=\" width=\"80%\"/> <p> * **Multimodal Rotary Position Embedding (M-ROPE)**: Decomposes positional embedding into parts to capture 1D textual, 2D visual, and 3D video positional information, enhancing its multimodal processing capabilities. <p align=\"center\"> <img src=\" width=\"80%\"/> <p> We have three models with 2, 7 and 72 billion parameters. This repo contains the **pretrained** 72B Qwen2-VL model. For more information, visit our Blog and GitHub. ## Requirements The code of Qwen2-VL has been in the latest Hugging Face and we advise you to install the latest version with command , or you might encounter the following error: ## Citation If you find our work helpful, feel free to give us a cite.",
20
+ "model_explanation_gemini": "\"Qwen2-VL-72B is a multimodal, pretrained base model excelling in high-resolution image understanding, long-video comprehension (20+ minutes), multilingual text recognition in images, and device operation via visual reasoning, featuring dynamic resolution handling and enhanced multimodal processing.\"\n\n**Features:** \n1. State-of-the-art visual understanding (images, videos, documents). \n2. Supports arbitrary image resolutions via dynamic token mapping. \n3. Processes 20+ minute videos for QA/dialog"
21
+ }
model_data_json/Qwen_Qwen2.5-32B-Instruct-AWQ.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2.5-32B-Instruct-AWQ",
3
+ "downloads": 70467,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen2",
8
+ "text-generation",
9
+ "chat",
10
+ "conversational",
11
+ "en",
12
+ "arxiv:2309.00071",
13
+ "arxiv:2407.10671",
14
+ "base_model:Qwen/Qwen2.5-32B-Instruct",
15
+ "base_model:quantized:Qwen/Qwen2.5-32B-Instruct",
16
+ "license:apache-2.0",
17
+ "autotrain_compatible",
18
+ "text-generation-inference",
19
+ "endpoints_compatible",
20
+ "4-bit",
21
+ "awq",
22
+ "region:us"
23
+ ],
24
+ "description": "--- base_model: Qwen/Qwen2.5-32B-Instruct language: - en library_name: transformers license: apache-2.0 license_link: pipeline_tag: text-generation tags: - chat --- # Qwen2.5-32B-Instruct-AWQ ## Introduction Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. Qwen2.5 brings the following improvements upon Qwen2: - Significantly **more knowledge** and has greatly improved capabilities in **coding** and **mathematics**, thanks to our specialized expert models in these domains. - Significant improvements in **instruction following**, **generating long texts** (over 8K tokens), **understanding structured data** (e.g, tables), and **generating structured outputs** especially JSON. **More resilient to the diversity of system prompts**, enhancing role-play implementation and condition-setting for chatbots. - **Long-context Support** up to 128K tokens and can generate up to 8K tokens. - **Multilingual support** for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more. **This repo contains the AWQ-quantized 4-bit instruction-tuned 32B Qwen2.5 model**, which has the following features: - Type: Causal Language Models - Training Stage: Pretraining & Post-training - Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias - Number of Parameters: 32.5B - Number of Paramaters (Non-Embedding): 31.0B - Number of Layers: 64 - Number of Attention Heads (GQA): 40 for Q and 8 for KV - Context Length: Full 131,072 tokens and generation 8192 tokens - Please refer to this section for detailed instructions on how to deploy Qwen2.5 for handling long texts. - Quantization: AWQ 4-bit For more details, please refer to our blog, GitHub, and Documentation. ## Requirements The code of Qwen2.5 has been in the latest Hugging face and we advise you to use the latest version of . With , you will encounter the following error: Also check out our AWQ documentation for more usage guide. ## Quickstart Here provides a code snippet with to show you how to load the tokenizer and model and how to generate contents. ### Processing Long Texts The current is set for context length up to 32,768 tokens. To handle extensive inputs exceeding 32,768 tokens, we utilize YaRN, a technique for enhancing model length extrapolation, ensuring optimal performance on lengthy texts. For supported frameworks, you could add the following to to enable YaRN: For deployment, we recommend using vLLM. Please refer to our Documentation for usage if you are not familar with vLLM. Presently, vLLM only supports static YARN, which means the scaling factor remains constant regardless of input length, **potentially impacting performance on shorter texts**. We advise adding the configuration only when processing long contexts is required. ## Evaluation & Performance Detailed evaluation results are reported in this 📑 blog. For quantized models, the benchmark results against the original bfloat16 models can be found here For requirements on GPU memory and the respective throughput, see results here. ## Citation If you find our work helpful, feel free to give us a cite.",
25
+ "model_explanation_gemini": "A 4-bit AWQ-quantized, 32.5B-parameter instruction-tuned causal language model optimized for text generation, featuring enhanced coding, mathematics, multilingual support (29+ languages), structured data handling, and long-context processing (128K tokens). \n\n**Features:** \n- **Core Task:** Text generation (chat, instruction following) \n- **Key Improvements:** Better coding/math, structured output (JSON), long-text generation (8K tokens), multilingual support"
26
+ }
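The "Quickstart" paragraph in the description above originally carried a code snippet that was lost in scraping. The sketch below reconstructs the usual transformers chat flow for this AWQ checkpoint under stated assumptions: it follows the generic Qwen2.5 pattern (load with `AutoModelForCausalLM`, build the prompt with the chat template, generate), the prompt text is made up, and an AWQ-capable kernel stack (e.g. AutoAWQ) is assumed to be installed so the 4-bit weights can load.

```python
# Sketch only: chat-style generation with the AWQ-quantized 32B instruct checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-32B-Instruct-AWQ"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the difference between AWQ and GPTQ quantization."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=512)
new_tokens = output_ids[0][inputs.input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```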
model_data_json/Qwen_Qwen2.5-32B.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2.5-32B",
3
+ "downloads": 76454,
4
+ "tags": [
5
+ "safetensors",
6
+ "qwen2",
7
+ "text-generation",
8
+ "conversational",
9
+ "en",
10
+ "arxiv:2407.10671",
11
+ "license:apache-2.0",
12
+ "region:us"
13
+ ],
14
+ "description": "--- license: apache-2.0 license_link: language: - en pipeline_tag: text-generation --- # Qwen2.5-32B ## Introduction Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. Qwen2.5 brings the following improvements upon Qwen2: - Significantly **more knowledge** and has greatly improved capabilities in **coding** and **mathematics**, thanks to our specialized expert models in these domains. - Significant improvements in **instruction following**, **generating long texts** (over 8K tokens), **understanding structured data** (e.g, tables), and **generating structured outputs** especially JSON. **More resilient to the diversity of system prompts**, enhancing role-play implementation and condition-setting for chatbots. - **Long-context Support** up to 128K tokens and can generate up to 8K tokens. - **Multilingual support** for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more. **This repo contains the base 32B Qwen2.5 model**, which has the following features: - Type: Causal Language Models - Training Stage: Pretraining - Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias - Number of Parameters: 32.5B - Number of Paramaters (Non-Embedding): 31.0B - Number of Layers: 64 - Number of Attention Heads (GQA): 40 for Q and 8 for KV - Context Length: 131,072 tokens **We do not recommend using base language models for conversations.** Instead, you can apply post-training, e.g., SFT, RLHF, continued pretraining, etc., on this model. For more details, please refer to our blog, GitHub, and Documentation. ## Requirements The code of Qwen2.5 has been in the latest Hugging face and we advise you to use the latest version of . With , you will encounter the following error: ## Evaluation & Performance Detailed evaluation results are reported in this 📑 blog. For requirements on GPU memory and the respective throughput, see results here. ## Citation If you find our work helpful, feel free to give us a cite.",
15
+ "model_explanation_gemini": "\"Qwen2.5-32B is a 32.5B-parameter causal language model optimized for text generation, featuring enhanced coding, mathematics, multilingual support (29+ languages), long-context handling (128K tokens), structured data understanding, and improved instruction following compared to Qwen2.\"\n\n### **Model Features:** \n- **Type:** Causal Language Model \n- **Architecture:** Transformers with RoPE, SwiGLU, RMSNorm, GQA (40"
16
+ }
model_data_json/Qwen_Qwen2.5-Math-7B-Instruct.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2.5-Math-7B-Instruct",
3
+ "downloads": 71240,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen2",
8
+ "text-generation",
9
+ "chat",
10
+ "conversational",
11
+ "en",
12
+ "arxiv:2409.12122",
13
+ "base_model:Qwen/Qwen2.5-Math-7B",
14
+ "base_model:finetune:Qwen/Qwen2.5-Math-7B",
15
+ "license:apache-2.0",
16
+ "autotrain_compatible",
17
+ "text-generation-inference",
18
+ "endpoints_compatible",
19
+ "region:us"
20
+ ],
21
+ "description": "--- base_model: Qwen/Qwen2.5-Math-7B language: - en pipeline_tag: text-generation tags: - chat library_name: transformers license: apache-2.0 license_link: --- # Qwen2.5-Math-7B-Instruct > [!Warning] > <div align=\"center\"> > <b> > 🚨 Qwen2.5-Math mainly supports solving English and Chinese math problems through CoT and TIR. We do not recommend using this series of models for other tasks. > </b> > </div> ## Introduction In August 2024, we released the first series of mathematical LLMs - Qwen2-Math - of our Qwen family. A month later, we have upgraded it and open-sourced **Qwen2.5-Math** series, including base models **Qwen2.5-Math-1.5B/7B/72B**, instruction-tuned models **Qwen2.5-Math-1.5B/7B/72B-Instruct**, and mathematical reward model **Qwen2.5-Math-RM-72B**. Unlike Qwen2-Math series which only supports using Chain-of-Thught (CoT) to solve English math problems, Qwen2.5-Math series is expanded to support using both CoT and Tool-integrated Reasoning (TIR) to solve math problems in both Chinese and English. The Qwen2.5-Math series models have achieved significant performance improvements compared to the Qwen2-Math series models on the Chinese and English mathematics benchmarks with CoT. ![]( While CoT plays a vital role in enhancing the reasoning capabilities of LLMs, it faces challenges in achieving computational accuracy and handling complex mathematical or algorithmic reasoning tasks, such as finding the roots of a quadratic equation or computing the eigenvalues of a matrix. TIR can further improve the model's proficiency in precise computation, symbolic manipulation, and algorithmic manipulation. Qwen2.5-Math-1.5B/7B/72B-Instruct achieve 79.7, 85.3, and 87.8 respectively on the MATH benchmark using TIR. ## Model Details For more details, please refer to our blog post and GitHub repo. ## Requirements * for Qwen2.5-Math models. The latest version is recommended. > [!Warning] > <div align=\"center\"> > <b> > 🚨 This is a must because <code>transformers</code> integrated Qwen2 codes since <code>4.37.0</code>. > </b> > </div> For requirements on GPU memory and the respective throughput, see similar results of Qwen2 here. ## Quick Start > [!Important] > > **Qwen2.5-Math-7B-Instruct** is an instruction model for chatting; > > **Qwen2.5-Math-7B** is a base model typically used for completion and few-shot inference, serving as a better starting point for fine-tuning. > ### 🤗 Hugging Face Transformers Qwen2.5-Math can be deployed and infered in the same way as Qwen2.5. Here we show a code snippet to show you how to use the chat model with : ## Citation If you find our work helpful, feel free to give us a citation.",
22
+ "model_explanation_gemini": "Specializes in solving English and Chinese math problems using Chain-of-Thought (CoT) and Tool-integrated Reasoning (TIR), with improved accuracy over its predecessor Qwen2-Math. \n\n**Features:** \n1. Supports English and Chinese math problem-solving. \n2. Utilizes CoT and TIR for enhanced reasoning and computational precision. \n3. Outperforms Qwen2-Math on Chinese/English benchmarks. \n4. Available in 1.5B,"
23
+ }
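The "Quick Start" part of the description above lost its code snippet in scraping. Since the flow matches the generic Qwen2.5 chat pattern, the only model-specific detail worth sketching is the CoT system prompt the card recommends; the example below is a hedged sketch, and the user question is illustrative.

```python
# Sketch only: CoT prompting for Qwen2.5-Math-7B-Instruct. The system prompt is the
# one the card recommends for step-by-step answers ending in \boxed{}.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

messages = [
    {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
    {"role": "user", "content": "Find the value of x that satisfies 4x + 5 = 6x + 7."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
```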
model_data_json/Qwen_Qwen2.5-VL-7B-Instruct-AWQ.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "model_id": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
3
+ "downloads": 78495,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen2_5_vl",
8
+ "image-text-to-text",
9
+ "multimodal",
10
+ "conversational",
11
+ "en",
12
+ "arxiv:2309.00071",
13
+ "arxiv:2409.12191",
14
+ "arxiv:2308.12966",
15
+ "base_model:Qwen/Qwen2.5-VL-7B-Instruct",
16
+ "base_model:quantized:Qwen/Qwen2.5-VL-7B-Instruct",
17
+ "license:apache-2.0",
18
+ "text-generation-inference",
19
+ "endpoints_compatible",
20
+ "4-bit",
21
+ "awq",
22
+ "region:us"
23
+ ],
24
+ "description": "--- license: apache-2.0 language: - en pipeline_tag: image-text-to-text tags: - multimodal library_name: transformers base_model: - Qwen/Qwen2.5-VL-7B-Instruct --- # Qwen2.5-VL-7B-Instruct-AWQ <a href=\" target=\"_blank\" style=\"margin: 2px;\"> <img alt=\"Chat\" src=\" style=\"display: inline-block; vertical-align: middle;\"/> </a> ## Introduction In the past five months since Qwen2-VL’s release, numerous developers have built new models on the Qwen2-VL vision-language models, providing us with valuable feedback. During this period, we focused on building more useful vision-language models. Today, we are excited to introduce the latest addition to the Qwen family: Qwen2.5-VL. #### Key Enhancements: * **Understand things visually**: Qwen2.5-VL is not only proficient in recognizing common objects such as flowers, birds, fish, and insects, but it is highly capable of analyzing texts, charts, icons, graphics, and layouts within images. * **Being agentic**: Qwen2.5-VL directly plays as a visual agent that can reason and dynamically direct tools, which is capable of computer use and phone use. * **Understanding long videos and capturing events**: Qwen2.5-VL can comprehend videos of over 1 hour, and this time it has a new ability of cpaturing event by pinpointing the relevant video segments. * **Capable of visual localization in different formats**: Qwen2.5-VL can accurately localize objects in an image by generating bounding boxes or points, and it can provide stable JSON outputs for coordinates and attributes. * **Generating structured outputs**: for data like scans of invoices, forms, tables, etc. Qwen2.5-VL supports structured outputs of their contents, benefiting usages in finance, commerce, etc. #### Model Architecture Updates: * **Dynamic Resolution and Frame Rate Training for Video Understanding**: We extend dynamic resolution to the temporal dimension by adopting dynamic FPS sampling, enabling the model to comprehend videos at various sampling rates. Accordingly, we update mRoPE in the time dimension with IDs and absolute time alignment, enabling the model to learn temporal sequence and speed, and ultimately acquire the ability to pinpoint specific moments. <p align=\"center\"> <img src=\" width=\"80%\"/> <p> * **Streamlined and Efficient Vision Encoder** We enhance both training and inference speeds by strategically implementing window attention into the ViT. The ViT architecture is further optimized with SwiGLU and RMSNorm, aligning it with the structure of the Qwen2.5 LLM. We have three models with 3, 7 and 72 billion parameters. This repo contains the instruction-tuned 7B Qwen2.5-VL model with AWQ. For more information, visit our Blog and GitHub. ## Evaluation ## Requirements The code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command: or you might encounter the following error: ## Quickstart Below, we provide simple examples to show how to use Qwen2.5-VL with 🤖 ModelScope and 🤗 Transformers. The code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command: or you might encounter the following error: We offer a toolkit to help you handle various types of visual input more conveniently, as if you were using an API. This includes base64, URLs, and interleaved images and videos. You can install it using the following command: If you are not using Linux, you might not be able to install from PyPI. 
In that case, you can use which will fall back to using torchvision for video processing. However, you can still install decord from source to get decord used when loading video. ### Using 🤗 Transformers to Chat Here we show a code snippet to show you how to use the chat model with and : ### 🤖 ModelScope We strongly advise users especially those in mainland China to use ModelScope. can help you solve issues concerning downloading checkpoints. ### More Usage Tips For input images, we support local files, base64, and URLs. For videos, we currently only support local files. #### Image Resolution for performance boost The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs, such as a token count range of 256-1280, to balance speed and memory usage. Besides, We provide two methods for fine-grained control over the image size input to the model: 1. Define min_pixels and max_pixels: Images will be resized to maintain their aspect ratio within the range of min_pixels and max_pixels. 2. Specify exact dimensions: Directly set and . These values will be rounded to the nearest multiple of 28. ### Processing Long Texts The current is set for context length up to 32,768 tokens. To handle extensive inputs exceeding 32,768 tokens, we utilize YaRN, a technique for enhancing model length extrapolation, ensuring optimal performance on lengthy texts. For supported frameworks, you could add the following to to enable YaRN: { ..., \"type\": \"yarn\", \"mrope_section\": [ 16, 24, 24 ], \"factor\": 4, \"original_max_position_embeddings\": 32768 } However, it should be noted that this method has a significant impact on the performance of temporal and spatial localization tasks, and is therefore not recommended for use. At the same time, for long video inputs, since MRoPE itself is more economical with ids, the max_position_embeddings can be directly modified to a larger value, such as 64k. ### Benchmark #### Performance of Quantized Models This section reports the generation performance of quantized models (including GPTQ and AWQ) of the Qwen2.5-VL series. Specifically, we report: - MMMU_VAL (Accuracy) - DocVQA_VAL (Accuracy) - MMBench_DEV_EN (Accuracy) - MathVista_MINI (Accuracy) We use VLMEvalkit to evaluate all models. | Model Size | Quantization | MMMU_VAL | DocVQA_VAL | MMBench_EDV_EN | MathVista_MINI | | --- | --- | --- | --- | --- | --- | | Qwen2.5-VL-72B-Instruct | BF16<br><sup>(🤗🤖) | 70.0 | 96.1 | 88.2 | 75.3 | | | AWQ<br><sup>(🤗🤖) | 69.1 | 96.0 | 87.9 | 73.8 | | Qwen2.5-VL-7B-Instruct | BF16<br><sup>(🤗🤖) | 58.4 | 94.9 | 84.1 | 67.9 | | | AWQ<br><sup>(🤗🤖) | 55.6 | 94.6 | 84.2 | 64.7 | | Qwen2.5-VL-3B-Instruct | BF16<br><sup>(🤗🤖) | 51.7 | 93.0 | 79.8 | 61.4 | | | AWQ<br><sup>(🤗🤖) | 49.1 | 91.8 | 78.0 | 58.8 | ## Citation If you find our work helpful, feel free to give us a cite.",
25
+ "model_explanation_gemini": "**A multimodal vision-language model excelling in visual understanding, agentic reasoning, long-video comprehension, object localization, and structured output generation for tasks like document analysis.** \n\n### **Features:** \n1. **Visual Understanding** – Recognizes objects, analyzes text/graphics in images, and interprets charts/layouts. \n2. **Agentic Capabilities** – Functions as a visual agent, dynamically using tools for computer/phone tasks. \n3. **Long-Video Processing** –"
26
+ }
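The quickstart and "image resolution" passages in the description above mention the qwen-vl-utils helper and the min_pixels/max_pixels controls, but their snippets were stripped. A hedged sketch follows: the image path, the pixel budget (chosen to match the 256-1280 visual-token range quoted in the card), and the question are assumptions, and `qwen_vl_utils` must be installed separately.

```python
# Sketch only: single-image chat with the AWQ checkpoint, using qwen_vl_utils to
# prepare visual inputs and min_pixels/max_pixels to bound the visual token count.
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

model_id = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(
    model_id, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
)

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": "file:///path/to/invoice.png"},  # placeholder path
        {"type": "text", "text": "Return the invoice total as JSON."},
    ],
}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text], images=image_inputs, videos=video_inputs,
    padding=True, return_tensors="pt",
).to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=256)
trimmed = output_ids[0][inputs.input_ids.shape[1]:]
print(processor.batch_decode([trimmed], skip_special_tokens=True)[0])
```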
model_data_json/Qwen_Qwen3-32B.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "model_id": "Qwen/Qwen3-32B",
3
+ "downloads": 75675,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen3",
8
+ "text-generation",
9
+ "conversational",
10
+ "arxiv:2309.00071",
11
+ "license:apache-2.0",
12
+ "autotrain_compatible",
13
+ "endpoints_compatible",
14
+ "region:us"
15
+ ],
16
+ "description": "--- library_name: transformers license: apache-2.0 license_link: pipeline_tag: text-generation --- # Qwen3-32B <a href=\" target=\"_blank\" style=\"margin: 2px;\"> <img alt=\"Chat\" src=\" style=\"display: inline-block; vertical-align: middle;\"/> </a> ## Qwen3 Highlights Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support, with the following key features: - **Uniquely support of seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose dialogue) **within single model**, ensuring optimal performance across various scenarios. - **Significantly enhancement in its reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning. - **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience. - **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and unthinking modes and achieving leading performance among open-source models in complex agent-based tasks. - **Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**. ## Model Overview **Qwen3-32B** has the following features: - Type: Causal Language Models - Training Stage: Pretraining & Post-training - Number of Parameters: 32.8B - Number of Paramaters (Non-Embedding): 31.2B - Number of Layers: 64 - Number of Attention Heads (GQA): 64 for Q and 8 for KV - Context Length: 32,768 natively and 131,072 tokens with YaRN. For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our blog, GitHub, and Documentation. ## Quickstart The code of Qwen3 has been in the latest Hugging Face and we advise you to use the latest version of . With , you will encounter the following error: The following contains a code snippet illustrating how to use the model generate content based on given inputs. For deployment, you can use or or to create an OpenAI-compatible API endpoint: - SGLang: - vLLM: For local use, applications such as Ollama, LMStudio, MLX-LM, llama.cpp, and KTransformers have also supported Qwen3. ## Switching Between Thinking and Non-Thinking Mode > [!TIP] > The switch is also available in APIs created by SGLang and vLLM. > Please refer to our documentation for SGLang and vLLM users. ### By default, Qwen3 has thinking capabilities enabled, similar to QwQ-32B. This means the model will use its reasoning abilities to enhance the quality of generated responses. For example, when explicitly setting or leaving it as the default value in , the model will engage its thinking mode. In this mode, the model will generate think content wrapped in a block, followed by the final response. > [!NOTE] > For thinking mode, use , , , and (the default setting in ). **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. For more detailed guidance, please refer to the Best Practices section. 
### We provide a hard switch to strictly disable the model's thinking behavior, aligning its functionality with the previous Qwen2.5-Instruct models. This mode is particularly useful in scenarios where disabling thinking is essential for enhancing efficiency. In this mode, the model will not generate any think content and will not include a block. > [!NOTE] > For non-thinking mode, we suggest using , , , and . For more detailed guidance, please refer to the Best Practices section. ### Advanced Usage: Switching Between Thinking and Non-Thinking Modes via User Input We provide a soft switch mechanism that allows users to dynamically control the model's behavior when . Specifically, you can add and to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations. Here is an example of a multi-turn conversation: > [!NOTE] > For API compatibility, when , regardless of whether the user uses or , the model will always output a block wrapped in . However, the content inside this block may be empty if thinking is disabled. > When , the soft switches are not valid. Regardless of any or tags input by the user, the model will not generate think content and will not include a block. ## Agentic Use Qwen3 excels in tool calling capabilities. We recommend using Qwen-Agent to make the best use of agentic ability of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity. To define the available tools, you can use the MCP configuration file, use the integrated tool of Qwen-Agent, or integrate other tools by yourself. ## Processing Long Texts Qwen3 natively supports context lengths of up to 32,768 tokens. For conversations where the total length (including both input and output) significantly exceeds this limit, we recommend using RoPE scaling techniques to handle long texts effectively. We have validated the model's performance on context lengths of up to 131,072 tokens using the YaRN method. YaRN is currently supported by several inference frameworks, e.g., and for local use, and for deployment. In general, there are two approaches to enabling YaRN for supported frameworks: - Modifying the model files: In the file, add the fields: For , you need to regenerate the GGUF file after the modification. - Passing command line arguments: For , you can use For , you can use For from , you can use > [!IMPORTANT] > If you encounter the following warning > > please upgrade . > [!NOTE] > All the notable open-source frameworks implement static YaRN, which means the scaling factor remains constant regardless of input length, **potentially impacting performance on shorter texts.** > We advise adding the configuration only when processing long contexts is required. > It is also recommended to modify the as needed. For example, if the typical context length for your application is 65,536 tokens, it would be better to set as 2.0. > [!NOTE] > The default in is set to 40,960. This allocation includes reserving 32,768 tokens for outputs and 8,192 tokens for typical prompts, which is sufficient for most scenarios involving short text processing. If the average context length does not exceed 32,768 tokens, we do not recommend enabling YaRN in this scenario, as it may potentially degrade model performance. > [!TIP] > The endpoint provided by Alibaba Model Studio supports dynamic YaRN by default and no extra configuration is needed. 
## Best Practices To achieve optimal performance, we recommend the following settings: 1. **Sampling Parameters**: - For thinking mode (), use , , , and . **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. - For non-thinking mode (), we suggest using , , , and . - For supported frameworks, you can adjust the parameter between 0 and 2 to reduce endless repetitions. However, using a higher value may occasionally result in language mixing and a slight decrease in model performance. 2. **Adequate Output Length**: We recommend using an output length of 32,768 tokens for most queries. For benchmarking on highly complex problems, such as those found in math and programming competitions, we suggest setting the max output length to 38,912 tokens. This provides the model with sufficient space to generate detailed and comprehensive responses, thereby enhancing its overall performance. 3. **Standardize Output Format**: We recommend using prompts to standardize model outputs when benchmarking. - **Math Problems**: Include \"Please reason step by step, and put your final answer within \\boxed{}.\" in the prompt. - **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: \"Please show your choice in the field with only the choice letter, e.g., .\" 4. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. It is implemented in the provided chat template in Jinja2. However, for frameworks that do not directly use the Jinja2 chat template, it is up to the developers to ensure that the best practice is followed. ### Citation If you find our work helpful, feel free to give us a cite.",
17
+ "model_explanation_gemini": "A 32.8B-parameter causal language model designed for advanced reasoning, multilingual tasks, and agent capabilities, featuring seamless switching between thinking (complex reasoning) and non-thinking (efficient dialogue) modes. \n\n**Features:** \n- **Dual-mode operation:** Toggle between thinking (logic/math/coding) and non-thinking (general dialogue) modes. \n- **Enhanced reasoning:** Outperforms predecessors (QwQ, Qwen2.5) in math, code,"
18
+ }
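The thinking/non-thinking discussion in the entry above relies on an `enable_thinking` flag of the chat template and on splitting the reply at the `</think>` delimiter, but the illustrative snippet was lost in scraping. Below is a hedged reconstruction: the `</think>` token id (151668) follows the Qwen3 documentation, the sampling values come from the Best Practices quoted above, and the question is made up.

```python
# Sketch only: thinking-mode generation with Qwen3-32B, separating the
# <think>...</think> reasoning from the final answer.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-32B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Is 2027 a prime number?"}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # set False to hard-disable the <think> block
)
inputs = tokenizer([text], return_tensors="pt").to(model.device)

output_ids = model.generate(
    **inputs, max_new_tokens=8192, do_sample=True,
    temperature=0.6, top_p=0.95, top_k=20,  # thinking-mode settings from Best Practices
)[0][inputs.input_ids.shape[1]:].tolist()

# 151668 is the </think> token id per the Qwen3 docs; fall back to 0 if it is absent.
try:
    split = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    split = 0
thinking = tokenizer.decode(output_ids[:split], skip_special_tokens=True).strip()
answer = tokenizer.decode(output_ids[split:], skip_special_tokens=True).strip()
print("thinking:", thinking[:200], "...\nanswer:", answer)
```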
model_data_json/Qwen_Qwen3-4B.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model_id": "Qwen/Qwen3-4B",
3
+ "downloads": 81038,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen3",
8
+ "text-generation",
9
+ "conversational",
10
+ "arxiv:2309.00071",
11
+ "base_model:Qwen/Qwen3-4B-Base",
12
+ "base_model:finetune:Qwen/Qwen3-4B-Base",
13
+ "license:apache-2.0",
14
+ "autotrain_compatible",
15
+ "endpoints_compatible",
16
+ "region:us"
17
+ ],
18
+ "description": "--- library_name: transformers license: apache-2.0 license_link: pipeline_tag: text-generation base_model: - Qwen/Qwen3-4B-Base --- # Qwen3-4B <a href=\" target=\"_blank\" style=\"margin: 2px;\"> <img alt=\"Chat\" src=\" style=\"display: inline-block; vertical-align: middle;\"/> </a> ## Qwen3 Highlights Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support, with the following key features: - **Uniquely support of seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose dialogue) **within single model**, ensuring optimal performance across various scenarios. - **Significantly enhancement in its reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning. - **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience. - **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and unthinking modes and achieving leading performance among open-source models in complex agent-based tasks. - **Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**. ## Model Overview **Qwen3-4B** has the following features: - Type: Causal Language Models - Training Stage: Pretraining & Post-training - Number of Parameters: 4.0B - Number of Paramaters (Non-Embedding): 3.6B - Number of Layers: 36 - Number of Attention Heads (GQA): 32 for Q and 8 for KV - Context Length: 32,768 natively and 131,072 tokens with YaRN. For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our blog, GitHub, and Documentation. > [!TIP] > If you encounter significant endless repetitions, please refer to the Best Practices section for optimal sampling parameters, and set the `transformerstransformerstransformers<4.51.0sglang>=0.4.6.post1vllm>=0.8.5enable_thinkingenable_thinking=Trueenable_thinking=Truetokenizer.apply_chat_template<think>...</think>Temperature=0.6TopP=0.95TopK=20MinP=0generation_config.jsonenable_thinking=False<think>...</think>Temperature=0.7TopP=0.8TopK=20MinP=0enable_thinking=True/think/no_thinkenable_thinking=True/think/no_think<think>...</think>enable_thinking=False/think/no_think<think>...</think>transformersllama.cppvllmsglangconfig.jsonrope_scalingllama.cppvllmsglangllama-serverllama.cpptransformers>=4.51.0rope_scalingfactorfactormax_position_embeddingsconfig.jsonenable_thinking=TrueTemperature=0.6TopP=0.95TopK=20MinP=0enable_thinking=FalseTemperature=0.7TopP=0.8TopK=20MinP=0presence_penaltyanswer\"answer\": \"C\"`.\" 4. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. It is implemented in the provided chat template in Jinja2. However, for frameworks that do not directly use the Jinja2 chat template, it is up to the developers to ensure that the best practice is followed. 
### Citation If you find our work helpful, feel free to give us a cite.",
19
+ "model_explanation_gemini": "A 4B-parameter causal language model specializing in reasoning, multilingual tasks, and agent capabilities, with unique switching between thinking (complex reasoning) and non-thinking (general dialogue) modes. \n\n**Features**: \n- **Dual-mode operation**: Toggles between thinking (logic/math/coding) and non-thinking (general dialogue) modes. \n- **Enhanced reasoning**: Outperforms previous Qwen models in math, code, and logical tasks. \n- **Human-aligned interactions**:"
20
+ }
model_data_json/Qwen_Qwen3-8B.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model_id": "Qwen/Qwen3-8B",
3
+ "downloads": 78129,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "qwen3",
8
+ "text-generation",
9
+ "conversational",
10
+ "arxiv:2309.00071",
11
+ "base_model:Qwen/Qwen3-8B-Base",
12
+ "base_model:finetune:Qwen/Qwen3-8B-Base",
13
+ "license:apache-2.0",
14
+ "autotrain_compatible",
15
+ "endpoints_compatible",
16
+ "region:us"
17
+ ],
18
+ "description": "--- library_name: transformers license: apache-2.0 license_link: pipeline_tag: text-generation base_model: - Qwen/Qwen3-8B-Base --- # Qwen3-8B <a href=\" target=\"_blank\" style=\"margin: 2px;\"> <img alt=\"Chat\" src=\" style=\"display: inline-block; vertical-align: middle;\"/> </a> ## Qwen3 Highlights Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support, with the following key features: - **Uniquely support of seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose dialogue) **within single model**, ensuring optimal performance across various scenarios. - **Significantly enhancement in its reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning. - **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience. - **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and unthinking modes and achieving leading performance among open-source models in complex agent-based tasks. - **Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**. ## Model Overview **Qwen3-8B** has the following features: - Type: Causal Language Models - Training Stage: Pretraining & Post-training - Number of Parameters: 8.2B - Number of Paramaters (Non-Embedding): 6.95B - Number of Layers: 36 - Number of Attention Heads (GQA): 32 for Q and 8 for KV - Context Length: 32,768 natively and 131,072 tokens with YaRN. For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our blog, GitHub, and Documentation. ## Quickstart The code of Qwen3 has been in the latest Hugging Face and we advise you to use the latest version of . With , you will encounter the following error: The following contains a code snippet illustrating how to use the model generate content based on given inputs. For deployment, you can use or or to create an OpenAI-compatible API endpoint: - SGLang: - vLLM: For local use, applications such as Ollama, LMStudio, MLX-LM, llama.cpp, and KTransformers have also supported Qwen3. ## Switching Between Thinking and Non-Thinking Mode > [!TIP] > The switch is also available in APIs created by SGLang and vLLM. > Please refer to our documentation for SGLang and vLLM users. ### By default, Qwen3 has thinking capabilities enabled, similar to QwQ-32B. This means the model will use its reasoning abilities to enhance the quality of generated responses. For example, when explicitly setting or leaving it as the default value in , the model will engage its thinking mode. In this mode, the model will generate think content wrapped in a block, followed by the final response. > [!NOTE] > For thinking mode, use , , , and (the default setting in ). **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. For more detailed guidance, please refer to the Best Practices section. 
### We provide a hard switch to strictly disable the model's thinking behavior, aligning its functionality with the previous Qwen2.5-Instruct models. This mode is particularly useful in scenarios where disabling thinking is essential for enhancing efficiency. In this mode, the model will not generate any think content and will not include a block. > [!NOTE] > For non-thinking mode, we suggest using , , , and . For more detailed guidance, please refer to the Best Practices section. ### Advanced Usage: Switching Between Thinking and Non-Thinking Modes via User Input We provide a soft switch mechanism that allows users to dynamically control the model's behavior when . Specifically, you can add and to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations. Here is an example of a multi-turn conversation: > [!NOTE] > For API compatibility, when , regardless of whether the user uses or , the model will always output a block wrapped in . However, the content inside this block may be empty if thinking is disabled. > When , the soft switches are not valid. Regardless of any or tags input by the user, the model will not generate think content and will not include a block. ## Agentic Use Qwen3 excels in tool calling capabilities. We recommend using Qwen-Agent to make the best use of agentic ability of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity. To define the available tools, you can use the MCP configuration file, use the integrated tool of Qwen-Agent, or integrate other tools by yourself. ## Processing Long Texts Qwen3 natively supports context lengths of up to 32,768 tokens. For conversations where the total length (including both input and output) significantly exceeds this limit, we recommend using RoPE scaling techniques to handle long texts effectively. We have validated the model's performance on context lengths of up to 131,072 tokens using the YaRN method. YaRN is currently supported by several inference frameworks, e.g., and for local use, and for deployment. In general, there are two approaches to enabling YaRN for supported frameworks: - Modifying the model files: In the file, add the fields: For , you need to regenerate the GGUF file after the modification. - Passing command line arguments: For , you can use For , you can use For from , you can use > [!IMPORTANT] > If you encounter the following warning > > please upgrade . > [!NOTE] > All the notable open-source frameworks implement static YaRN, which means the scaling factor remains constant regardless of input length, **potentially impacting performance on shorter texts.** > We advise adding the configuration only when processing long contexts is required. > It is also recommended to modify the as needed. For example, if the typical context length for your application is 65,536 tokens, it would be better to set as 2.0. > [!NOTE] > The default in is set to 40,960. This allocation includes reserving 32,768 tokens for outputs and 8,192 tokens for typical prompts, which is sufficient for most scenarios involving short text processing. If the average context length does not exceed 32,768 tokens, we do not recommend enabling YaRN in this scenario, as it may potentially degrade model performance. > [!TIP] > The endpoint provided by Alibaba Model Studio supports dynamic YaRN by default and no extra configuration is needed. 
## Best Practices To achieve optimal performance, we recommend the following settings: 1. **Sampling Parameters**: - For thinking mode (), use , , , and . **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. - For non-thinking mode (), we suggest using , , , and . - For supported frameworks, you can adjust the parameter between 0 and 2 to reduce endless repetitions. However, using a higher value may occasionally result in language mixing and a slight decrease in model performance. 2. **Adequate Output Length**: We recommend using an output length of 32,768 tokens for most queries. For benchmarking on highly complex problems, such as those found in math and programming competitions, we suggest setting the max output length to 38,912 tokens. This provides the model with sufficient space to generate detailed and comprehensive responses, thereby enhancing its overall performance. 3. **Standardize Output Format**: We recommend using prompts to standardize model outputs when benchmarking. - **Math Problems**: Include \"Please reason step by step, and put your final answer within \\boxed{}.\" in the prompt. - **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: \"Please show your choice in the field with only the choice letter, e.g., .\" 4. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. It is implemented in the provided chat template in Jinja2. However, for frameworks that do not directly use the Jinja2 chat template, it is up to the developers to ensure that the best practice is followed. ### Citation If you find our work helpful, feel free to give us a cite.",
19
+ "model_explanation_gemini": "\"Qwen3-8B is an 8.2B-parameter causal language model supporting seamless switching between thinking mode (for complex reasoning, math, coding) and non-thinking mode (efficient dialogue), excelling in multilingual tasks, agent capabilities, and human-aligned interactions.\"\n\n**Features**: \n- **Dual-mode operation**: Toggle between thinking (reasoning-focused) and non-thinking (general dialogue) modes. \n- **Enhanced reasoning**: Outperforms QwQ and"
20
+ }
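The "Processing Long Texts" passage above describes enabling static YaRN either through command-line flags or by editing the checkpoint's config file, but the JSON it quotes did not survive scraping. Below is a hedged sketch of the config-editing route: the field names follow the Qwen3 documentation (some frameworks spell the key `"type"` rather than `"rope_type"`), the checkpoint path is a placeholder, and factor 4.0 targets the 131,072-token window mentioned above.

```python
# Sketch only: add a static YaRN rope_scaling section to a locally downloaded
# Qwen3-8B config.json, as described in "Processing Long Texts" above.
import json
from pathlib import Path

config_path = Path("/path/to/Qwen3-8B/config.json")  # placeholder local checkpoint
config = json.loads(config_path.read_text())

config["rope_scaling"] = {
    "rope_type": "yarn",                        # some frameworks expect "type" instead
    "factor": 4.0,                              # 131072 / 32768 native context
    "original_max_position_embeddings": 32768,
}
config["max_position_embeddings"] = 131072      # only raise this when you really need it

config_path.write_text(json.dumps(config, indent=2))
print("rope_scaling written to", config_path)
```

As the card itself warns, static YaRN applies the same scaling factor to short inputs too, so keep this change out of the config unless long-context runs are actually required.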
model_data_json/RedHatAI_Mistral-Small-24B-Instruct-2501-FP8-dynamic.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "model_id": "RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-dynamic",
3
+ "downloads": 81938,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "mistral",
8
+ "text-generation",
9
+ "mistral-small",
10
+ "fp8",
11
+ "vllm",
12
+ "conversational",
13
+ "en",
14
+ "base_model:mistralai/Mistral-Small-24B-Instruct-2501",
15
+ "base_model:quantized:mistralai/Mistral-Small-24B-Instruct-2501",
16
+ "license:apache-2.0",
17
+ "autotrain_compatible",
18
+ "text-generation-inference",
19
+ "endpoints_compatible",
20
+ "compressed-tensors",
21
+ "region:us"
22
+ ],
23
+ "description": "--- license: apache-2.0 language: - en tags: - mistral - mistral-small - fp8 - vllm base_model: mistralai/Mistral-Small-24B-Instruct-2501 library_name: transformers --- # Mistral-Small-24B-Instruct-2501-FP8-Dynamic ## Model Overview - **Model Architecture:** Mistral-Small-24B-Instruct-2501 - **Input:** Text - **Output:** Text - **Model Optimizations:** - **Weight quantization:** FP8 - **Activation quantization:** FP8 - **Release Date:** 3/1/2025 - **Version:** 1.0 - **Model Developers:** Neural Magic Quantized version of Mistral-Small-24B-Instruct-2501. It achieves an average score of 78.88 on the OpenLLM benchmark (version 1), whereas the unquantized model achieves 79.45. ### Model Optimizations This model was obtained by quantizing the weights and activations to FP8 data type, ready for inference with vLLM. This optimization reduces the number of bits per parameter from 16 to 8, reducing the disk size and GPU memory requirements by approximately 50%. Only the weights and activations of the linear operators within transformers blocks are quantized. ## Deployment ### Use with vLLM This model can be deployed efficiently using the vLLM backend, as shown in the example below. vLLM also supports OpenAI-compatible serving. See the documentation for more details. ## Creation This model was created with llm-compressor by running the code snippet below. ## Evaluation The model was evaluated on OpenLLM Leaderboard V1 and V2, using the following commands: OpenLLM Leaderboard V1: OpenLLM Leaderboard V2: ### Accuracy #### OpenLLM Leaderboard V1 evaluation scores | Metric | mistralai/Mistral-Small-24B-Instruct-2501 | nm-testing/Mistral-Small-24B-Instruct-2501-FP8-dynamic | |-----------------------------------------|:---------------------------------:|:-------------------------------------------:| | ARC-Challenge (Acc-Norm, 25-shot) | 72.18 | 71.76 | | GSM8K (Strict-Match, 5-shot) | 90.14 | 89.01 | | HellaSwag (Acc-Norm, 10-shot) | 85.05 | 84.65 | | MMLU (Acc, 5-shot) | 80.69 | 80.55 | | TruthfulQA (MC2, 0-shot) | 65.55 | 64.85 | | Winogrande (Acc, 5-shot) | 83.11 | 82.48 | | **Average Score** | **79.45** | **78.88** | | **Recovery (%)** | **100.00** | **99.28** | #### OpenLLM Leaderboard V2 evaluation scores | Metric | mistralai/Mistral-Small-24B-Instruct-2501 | nm-testing/Mistral-Small-24B-Instruct-2501-FP8-dynamic | |---------------------------------------------------------|:---------------------------------:|:-------------------------------------------:| | IFEval (Inst-and-Prompt Level Strict Acc, 0-shot) | 73.27 | 73.53 | | BBH (Acc-Norm, 3-shot) | 45.18 | 44.39 | | MMLU-Pro (Acc, 5-shot) | 38.83 | 37.28 | | **Average Score** | **52.42** | **51.73** | | **Recovery (%)** | **100.00** | **98.68** | | Math-Hard (Exact-Match, 4-shot) | 6.35 | 2.99 | | GPQA (Acc-Norm, 0-shot) | 8.29 | 6.97 | | MUSR (Acc-Norm, 0-shot) | 7.84 | 8.04 | Results on Math-Hard, GPQA, and MUSR are not considred for accuracy recovery calculation because the unquantized model has close to random prediction accuracy (6.35, 8.29, 7.84) which doesn't provide a reliable baseline for recovery calculation.",
24
+ "model_explanation_gemini": "A quantized FP8 version of Mistral-Small-24B-Instruct-2501 optimized for efficient inference with reduced memory and disk requirements while maintaining near-original accuracy (78.88 vs. 79.45 on OpenLLM benchmark). \n\n**Features:** \n- **Architecture:** Mistral-Small-24B-Instruct-2501 \n- **Quantization:** FP8 weights and activations (50% smaller size/memory) \n- **Task:** Text-to"
25
+ }
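The "Deployment" section of the description above points to vLLM but its example was stripped. A hedged sketch of offline inference with vLLM's Python API follows; the prompt, sampling values, and `max_model_len` are illustrative, and `LLM.chat` assumes a reasonably recent vLLM release.

```python
# Sketch only: offline chat inference with vLLM on the FP8-dynamic checkpoint.
from vllm import LLM, SamplingParams

llm = LLM(
    model="RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-dynamic",
    max_model_len=8192,  # illustrative; raise if longer contexts are needed
)
sampling = SamplingParams(temperature=0.3, max_tokens=256)

conversation = [
    {"role": "user", "content": "In two sentences, what does FP8 dynamic quantization change?"}
]
outputs = llm.chat(conversation, sampling_params=sampling)
print(outputs[0].outputs[0].text)
```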
model_data_json/Revai_reverb-diarization-v1.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "model_id": "Revai/reverb-diarization-v1",
3
+ "downloads": 175090,
4
+ "tags": [
5
+ "pyannote-audio",
6
+ "pytorch",
7
+ "reverb",
8
+ "automatic-speech-recognition",
9
+ "arxiv:2410.03930",
10
+ "license:other",
11
+ "region:us"
12
+ ],
13
+ "description": "--- license: other library_name: pyannote-audio tags: - reverb pipeline_tag: automatic-speech-recognition --- Details on the model, it's performance, and more available on Arxiv. For more information on how to run this diarization model see Reverb diarization V1 provides a 16.5% relative improvement in WDER (Word Diarization Error Rate) compared to the baseline pyannote3.0 model, evaluated on over 1,250,000 tokens across five different test suites. | Test suite | WDER | |---------| --------| |earnings21 | 0.047 | |rev16 | 0.077| # Usage # Cite this Model If you use this model please use the following citation: # License See LICENSE for details.",
14
+ "model_explanation_gemini": "\"Revai_reverb-diarization-v1 is a speech diarization model that improves word-level speaker attribution accuracy, reducing WDER by 16.5% compared to pyannote3.0.\"\n\nFeatures: \n- 16.5% lower WDER than pyannote3.0 \n- Evaluated on 1.25M+ tokens across 5 test suites \n- Specific performance metrics: 0.047 WDER (earnings21), 0.077"
15
+ }
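The "Usage" heading in the description above is empty in this scrape. A hedged sketch with pyannote.audio (the library named in the tags) follows; whether this checkpoint loads directly through `Pipeline.from_pretrained`, as well as the audio path and token, are assumptions.

```python
# Sketch only: speaker diarization with pyannote.audio.
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    "Revai/reverb-diarization-v1",
    use_auth_token="hf_xxx",  # hypothetical Hugging Face access token
)

diarization = pipeline("meeting.wav")  # placeholder audio file
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:6.1f}s  {turn.end:6.1f}s  {speaker}")
```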
model_data_json/Rostlab_prot_t5_xl_uniref50.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "model_id": "Rostlab/prot_t5_xl_uniref50",
3
+ "downloads": 73585,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "t5",
8
+ "text2text-generation",
9
+ "protein language model",
10
+ "dataset:UniRef50",
11
+ "autotrain_compatible",
12
+ "text-generation-inference",
13
+ "endpoints_compatible",
14
+ "region:us"
15
+ ],
16
+ "description": "--- tags: - protein language model datasets: - UniRef50 --- # ProtT5-XL-UniRef50 model Pretrained model on protein sequences using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model is trained on uppercase amino acids: it only works with capital-letter amino acids. ## Model description ProtT5-XL-UniRef50 is based on the T5-3B model and was pretrained on a large corpus of protein sequences in a self-supervised fashion. This means it was pretrained on the raw protein sequences only, with no humans labelling them in any way (which is why it can use lots of publicly available data), with an automatic process to generate inputs and labels from those protein sequences. One important difference between this T5 model and the original T5 version is the denoising objective. The original T5-3B model was pretrained using a span denoising objective, while this model was pre-trained with a BART-like MLM denoising objective. The masking probability is consistent with the original T5 training: 15% of the amino acids in the input are randomly masked. It has been shown that the features extracted from this self-supervised model (LM-embeddings) capture important biophysical properties governing protein shape. This implies the model has learned some of the grammar of the language of life as realized in protein sequences. ## Intended uses & limitations The model can be used for protein feature extraction or be fine-tuned on downstream tasks. We have noticed that on some tasks one can gain more accuracy by fine-tuning the model rather than using it as a feature extractor. We have also noticed that for feature extraction it is better to use the features extracted from the encoder rather than from the decoder. ### How to use Here is how to use this model to extract the features of a given protein sequence in PyTorch: ## Training data The ProtT5-XL-UniRef50 model was pretrained on UniRef50, a dataset consisting of 45 million protein sequences. ## Training procedure ### Preprocessing The protein sequences are uppercased and tokenized using a single space and a vocabulary size of 21. The rare amino acids \"U,Z,O,B\" were mapped to \"X\". The inputs of the model are then of the form: The preprocessing step was performed on the fly, by cutting and padding the protein sequences up to 512 tokens. The details of the masking procedure for each sequence are as follows: - 15% of the amino acids are masked. - In 90% of the cases, the masked amino acids are replaced by the mask token. - In 10% of the cases, the masked amino acids are replaced by a random amino acid (different from the one they replace). ### Pretraining The model was trained on a single TPU Pod V2-256 for 991.5 thousand steps in total, using sequence length 512 (batch size 2k). It was trained using the ProtT5-XL-BFD model as an initial checkpoint, rather than training from scratch. It has a total of approximately 3B parameters and was trained using an encoder-decoder architecture. The optimizer used is AdaFactor with an inverse square root learning rate schedule for pre-training. 
## Evaluation results When the model is used for feature extraction, this model achieves the following results: Test results : | Task/Dataset | secondary structure (3-states) | secondary structure (8-states) | Localization | Membrane | |:-----:|:-----:|:-----:|:-----:|:-----:| | CASP12 | 81 | 70 | | | | TS115 | 87 | 77 | | | | CB513 | 86 | 74 | | | | DeepLoc | | | 81 | 91 | ### BibTeX entry and citation info > Created by Ahmed Elnaggar/@Elnaggar_AI | LinkedIn",
17
+ "model_explanation_gemini": "Pretrained on UniRef50 protein sequences using masked language modeling, ProtT5-XL-UniRef50 extracts protein features or fine-tunes for downstream tasks, capturing biophysical properties through self-supervised learning on uppercase amino acids. \n\n**Features:** \n- Masked language modeling (MLM) objective (15% masking) \n- Trained on 45M UniRef50 protein sequences (uppercased) \n- 3B parameters, encoder-decoder architecture \n- Input:"
18
+ }
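The "How to use" snippet that the card refers to is not included in this record. The sketch below follows only what the card does state (uppercase residues, rare amino acids U/Z/O/B mapped to X, space-separated tokens, features taken from the encoder); the example sequence is made up.

```python
# Hedged sketch of encoder-side feature extraction for ProtT5-XL-UniRef50.
import re
import torch
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50").eval()

sequence = "MKTAYIAKQR"  # made-up example protein sequence
# Uppercase, map rare residues U/Z/O/B to X, and separate residues with spaces,
# as described in the preprocessing section of the card.
prepared = " ".join(re.sub(r"[UZOB]", "X", sequence.upper()))

inputs = tokenizer(prepared, return_tensors="pt")
with torch.no_grad():
    residue_embeddings = model(**inputs).last_hidden_state  # one vector per residue (plus the end token)
print(residue_embeddings.shape)
```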
model_data_json/Snowflake_snowflake-arctic-embed-m-v2.0.json ADDED
The diff for this file is too large to render. See raw diff
 
model_data_json/THUDM_glm-4-voice-tokenizer.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "model_id": "THUDM/glm-4-voice-tokenizer",
3
+ "downloads": 78346,
4
+ "tags": [
5
+ "safetensors",
6
+ "whisper",
7
+ "region:us"
8
+ ],
9
+ "description": "# GLM-4-Voice-Tokenizer GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够根据用户的指令改变语音的情感、语调、语速、方言等属性。 GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions. 本仓库是 GLM-4-Voice 的 speech tokenizer 部分。通过在 Whisper 的 encoder 部分增加 vector quantization 进行训练,将连续的语音输入转化为离散的 token。每秒音频转化为 12.5 个离散 token。 The repo provides the speech tokenizer of GLM-4-Voice, which is trained by adding vector quantization to the encoder part of Whisper and converts continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens. 更多信息请参考我们的仓库 GLM-4-Voice. For more information, please refer to our repo GLM-4-Voice.",
10
+ "model_explanation_gemini": "\"GLM-4-Voice-Tokenizer converts continuous speech into discrete tokens (12.5 tokens per second) using Whisper's encoder with added vector quantization, enabling GLM-4-Voice's end-to-end multilingual speech understanding, generation, and real-time voice modulation (emotion, tone, speed, dialect).\" \n\n**Features**: \n1. Speech-to-token conversion (12.5 tokens/sec) \n2. Based on Whisper’s encoder + vector quantization"
11
+ }
model_data_json/TaylorAI_gte-tiny.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "TaylorAI/gte-tiny",
3
+ "downloads": 74970,
4
+ "tags": [
5
+ "sentence-transformers",
6
+ "pytorch",
7
+ "onnx",
8
+ "safetensors",
9
+ "bert",
10
+ "feature-extraction",
11
+ "sentence-similarity",
12
+ "transformers",
13
+ "mteb",
14
+ "model-index",
15
+ "autotrain_compatible",
16
+ "text-embeddings-inference",
17
+ "endpoints_compatible",
18
+ "region:us"
19
+ ],
20
+ "description": "--- model-index: - name: gte_tiny results: - task: type: Classification dataset: type: mteb/amazon_counterfactual name: MTEB AmazonCounterfactualClassification (en) config: en split: test revision: e8379541af4e31359cca9fbcf4b00f2671dba205 metrics: - type: accuracy value: 71.76119402985076 - type: ap value: 34.63659287952359 - type: f1 value: 65.88939512571113 - task: type: Classification dataset: type: mteb/amazon_polarity name: MTEB AmazonPolarityClassification config: default split: test revision: e2d317d38cd51312af73b3d32a06d1a08b442046 metrics: - type: accuracy value: 86.61324999999998 - type: ap value: 81.7476302802319 - type: f1 value: 86.5863470912001 - task: type: Classification dataset: type: mteb/amazon_reviews_multi name: MTEB AmazonReviewsClassification (en) config: en split: test revision: 1399c76144fd37290681b995c656ef9b2e06e26d metrics: - type: accuracy value: 42.61000000000001 - type: f1 value: 42.2217180000715 - task: type: Retrieval dataset: type: arguana name: MTEB ArguAna config: default split: test revision: None metrics: - type: map_at_1 value: 28.377999999999997 - type: map_at_10 value: 44.565 - type: map_at_100 value: 45.48 - type: map_at_1000 value: 45.487 - type: map_at_3 value: 39.841 - type: map_at_5 value: 42.284 - type: mrr_at_1 value: 29.445 - type: mrr_at_10 value: 44.956 - type: mrr_at_100 value: 45.877 - type: mrr_at_1000 value: 45.884 - type: mrr_at_3 value: 40.209 - type: mrr_at_5 value: 42.719 - type: ndcg_at_1 value: 28.377999999999997 - type: ndcg_at_10 value: 53.638 - type: ndcg_at_100 value: 57.354000000000006 - type: ndcg_at_1000 value: 57.513000000000005 - type: ndcg_at_3 value: 43.701 - type: ndcg_at_5 value: 48.114000000000004 - type: precision_at_1 value: 28.377999999999997 - type: precision_at_10 value: 8.272 - type: precision_at_100 value: 0.984 - type: precision_at_1000 value: 0.1 - type: precision_at_3 value: 18.303 - type: precision_at_5 value: 13.129 - type: recall_at_1 value: 28.377999999999997 - type: recall_at_10 value: 82.717 - type: recall_at_100 value: 98.43499999999999 - type: recall_at_1000 value: 99.644 - type: recall_at_3 value: 54.908 - type: recall_at_5 value: 65.647 - task: type: Clustering dataset: type: mteb/arxiv-clustering-p2p name: MTEB ArxivClusteringP2P config: default split: test revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d metrics: - type: v_measure value: 46.637318326729876 - task: type: Clustering dataset: type: mteb/arxiv-clustering-s2s name: MTEB ArxivClusteringS2S config: default split: test revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53 metrics: - type: v_measure value: 36.01134479855804 - task: type: Reranking dataset: type: mteb/askubuntudupquestions-reranking name: MTEB AskUbuntuDupQuestions config: default split: test revision: 2000358ca161889fa9c082cb41daa8dcfb161a54 metrics: - type: map value: 59.82917555338909 - type: mrr value: 74.7888361254012 - task: type: STS dataset: type: mteb/biosses-sts name: MTEB BIOSSES config: default split: test revision: d3fb88f8f02e40887cd149695127462bbcf29b4a metrics: - type: cos_sim_pearson value: 87.1657730995964 - type: cos_sim_spearman value: 86.62787748941281 - type: euclidean_pearson value: 85.48127914481798 - type: euclidean_spearman value: 86.48148861167424 - type: manhattan_pearson value: 85.07496934780823 - type: manhattan_spearman value: 86.39473964708843 - task: type: Classification dataset: type: mteb/banking77 name: MTEB Banking77Classification config: default split: test revision: 0fd18e25b25c072e09e0d92ab615fda904d66300 metrics: 
- type: accuracy value: 81.73051948051948 - type: f1 value: 81.66368364988331 - task: type: Clustering dataset: type: mteb/biorxiv-clustering-p2p name: MTEB BiorxivClusteringP2P config: default split: test revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40 metrics: - type: v_measure value: 39.18623707448217 - task: type: Clustering dataset: type: mteb/biorxiv-clustering-s2s name: MTEB BiorxivClusteringS2S config: default split: test revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908 metrics: - type: v_measure value: 32.12697757150375 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackAndroidRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 29.160000000000004 - type: map_at_10 value: 40.474 - type: map_at_100 value: 41.905 - type: map_at_1000 value: 42.041000000000004 - type: map_at_3 value: 37.147000000000006 - type: map_at_5 value: 38.873999999999995 - type: mrr_at_1 value: 36.91 - type: mrr_at_10 value: 46.495999999999995 - type: mrr_at_100 value: 47.288000000000004 - type: mrr_at_1000 value: 47.339999999999996 - type: mrr_at_3 value: 43.777 - type: mrr_at_5 value: 45.257999999999996 - type: ndcg_at_1 value: 36.91 - type: ndcg_at_10 value: 46.722 - type: ndcg_at_100 value: 51.969 - type: ndcg_at_1000 value: 54.232 - type: ndcg_at_3 value: 41.783 - type: ndcg_at_5 value: 43.797000000000004 - type: precision_at_1 value: 36.91 - type: precision_at_10 value: 9.013 - type: precision_at_100 value: 1.455 - type: precision_at_1000 value: 0.193 - type: precision_at_3 value: 20.124 - type: precision_at_5 value: 14.363000000000001 - type: recall_at_1 value: 29.160000000000004 - type: recall_at_10 value: 58.521 - type: recall_at_100 value: 80.323 - type: recall_at_1000 value: 95.13000000000001 - type: recall_at_3 value: 44.205 - type: recall_at_5 value: 49.97 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackEnglishRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 27.750000000000004 - type: map_at_10 value: 36.39 - type: map_at_100 value: 37.5 - type: map_at_1000 value: 37.625 - type: map_at_3 value: 33.853 - type: map_at_5 value: 35.397 - type: mrr_at_1 value: 34.14 - type: mrr_at_10 value: 41.841 - type: mrr_at_100 value: 42.469 - type: mrr_at_1000 value: 42.521 - type: mrr_at_3 value: 39.724 - type: mrr_at_5 value: 40.955999999999996 - type: ndcg_at_1 value: 34.14 - type: ndcg_at_10 value: 41.409 - type: ndcg_at_100 value: 45.668 - type: ndcg_at_1000 value: 47.916 - type: ndcg_at_3 value: 37.836 - type: ndcg_at_5 value: 39.650999999999996 - type: precision_at_1 value: 34.14 - type: precision_at_10 value: 7.739 - type: precision_at_100 value: 1.2630000000000001 - type: precision_at_1000 value: 0.173 - type: precision_at_3 value: 18.217 - type: precision_at_5 value: 12.854 - type: recall_at_1 value: 27.750000000000004 - type: recall_at_10 value: 49.882 - type: recall_at_100 value: 68.556 - type: recall_at_1000 value: 83.186 - type: recall_at_3 value: 39.047 - type: recall_at_5 value: 44.458 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackGamingRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 36.879 - type: map_at_10 value: 48.878 - type: map_at_100 value: 49.918 - type: map_at_1000 value: 49.978 - type: map_at_3 value: 45.867999999999995 - type: map_at_5 value: 47.637 - type: mrr_at_1 value: 42.696 - type: mrr_at_10 value: 52.342 - type: mrr_at_100 value: 53.044000000000004 - type: mrr_at_1000 value: 53.077 
- type: mrr_at_3 value: 50.01 - type: mrr_at_5 value: 51.437 - type: ndcg_at_1 value: 42.696 - type: ndcg_at_10 value: 54.469 - type: ndcg_at_100 value: 58.664 - type: ndcg_at_1000 value: 59.951 - type: ndcg_at_3 value: 49.419999999999995 - type: ndcg_at_5 value: 52.007000000000005 - type: precision_at_1 value: 42.696 - type: precision_at_10 value: 8.734 - type: precision_at_100 value: 1.1769999999999998 - type: precision_at_1000 value: 0.133 - type: precision_at_3 value: 22.027 - type: precision_at_5 value: 15.135000000000002 - type: recall_at_1 value: 36.879 - type: recall_at_10 value: 67.669 - type: recall_at_100 value: 85.822 - type: recall_at_1000 value: 95.092 - type: recall_at_3 value: 54.157999999999994 - type: recall_at_5 value: 60.436 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackGisRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 22.942 - type: map_at_10 value: 31.741999999999997 - type: map_at_100 value: 32.721000000000004 - type: map_at_1000 value: 32.809 - type: map_at_3 value: 29.17 - type: map_at_5 value: 30.714000000000002 - type: mrr_at_1 value: 24.746000000000002 - type: mrr_at_10 value: 33.517 - type: mrr_at_100 value: 34.451 - type: mrr_at_1000 value: 34.522000000000006 - type: mrr_at_3 value: 31.148999999999997 - type: mrr_at_5 value: 32.606 - type: ndcg_at_1 value: 24.746000000000002 - type: ndcg_at_10 value: 36.553000000000004 - type: ndcg_at_100 value: 41.53 - type: ndcg_at_1000 value: 43.811 - type: ndcg_at_3 value: 31.674000000000003 - type: ndcg_at_5 value: 34.241 - type: precision_at_1 value: 24.746000000000002 - type: precision_at_10 value: 5.684 - type: precision_at_100 value: 0.859 - type: precision_at_1000 value: 0.109 - type: precision_at_3 value: 13.597000000000001 - type: precision_at_5 value: 9.672 - type: recall_at_1 value: 22.942 - type: recall_at_10 value: 49.58 - type: recall_at_100 value: 72.614 - type: recall_at_1000 value: 89.89200000000001 - type: recall_at_3 value: 36.552 - type: recall_at_5 value: 42.702 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackMathematicaRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 15.345 - type: map_at_10 value: 22.428 - type: map_at_100 value: 23.756 - type: map_at_1000 value: 23.872 - type: map_at_3 value: 20.212 - type: map_at_5 value: 21.291 - type: mrr_at_1 value: 19.279 - type: mrr_at_10 value: 27.1 - type: mrr_at_100 value: 28.211000000000002 - type: mrr_at_1000 value: 28.279 - type: mrr_at_3 value: 24.813 - type: mrr_at_5 value: 25.889 - type: ndcg_at_1 value: 19.279 - type: ndcg_at_10 value: 27.36 - type: ndcg_at_100 value: 33.499 - type: ndcg_at_1000 value: 36.452 - type: ndcg_at_3 value: 23.233999999999998 - type: ndcg_at_5 value: 24.806 - type: precision_at_1 value: 19.279 - type: precision_at_10 value: 5.149 - type: precision_at_100 value: 0.938 - type: precision_at_1000 value: 0.133 - type: precision_at_3 value: 11.360000000000001 - type: precision_at_5 value: 8.035 - type: recall_at_1 value: 15.345 - type: recall_at_10 value: 37.974999999999994 - type: recall_at_100 value: 64.472 - type: recall_at_1000 value: 85.97200000000001 - type: recall_at_3 value: 26.203 - type: recall_at_5 value: 30.485 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackPhysicsRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 26.362000000000002 - type: map_at_10 value: 36.406 - type: map_at_100 value: 37.726 - type: map_at_1000 
value: 37.84 - type: map_at_3 value: 33.425 - type: map_at_5 value: 35.043 - type: mrr_at_1 value: 32.146 - type: mrr_at_10 value: 41.674 - type: mrr_at_100 value: 42.478 - type: mrr_at_1000 value: 42.524 - type: mrr_at_3 value: 38.948 - type: mrr_at_5 value: 40.415 - type: ndcg_at_1 value: 32.146 - type: ndcg_at_10 value: 42.374 - type: ndcg_at_100 value: 47.919 - type: ndcg_at_1000 value: 50.013 - type: ndcg_at_3 value: 37.29 - type: ndcg_at_5 value: 39.531 - type: precision_at_1 value: 32.146 - type: precision_at_10 value: 7.767 - type: precision_at_100 value: 1.236 - type: precision_at_1000 value: 0.16 - type: precision_at_3 value: 17.965999999999998 - type: precision_at_5 value: 12.742999999999999 - type: recall_at_1 value: 26.362000000000002 - type: recall_at_10 value: 54.98800000000001 - type: recall_at_100 value: 78.50200000000001 - type: recall_at_1000 value: 92.146 - type: recall_at_3 value: 40.486 - type: recall_at_5 value: 46.236 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackProgrammersRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 24.417 - type: map_at_10 value: 33.161 - type: map_at_100 value: 34.357 - type: map_at_1000 value: 34.473 - type: map_at_3 value: 30.245 - type: map_at_5 value: 31.541999999999998 - type: mrr_at_1 value: 29.909000000000002 - type: mrr_at_10 value: 38.211 - type: mrr_at_100 value: 39.056999999999995 - type: mrr_at_1000 value: 39.114 - type: mrr_at_3 value: 35.769 - type: mrr_at_5 value: 36.922 - type: ndcg_at_1 value: 29.909000000000002 - type: ndcg_at_10 value: 38.694 - type: ndcg_at_100 value: 44.057 - type: ndcg_at_1000 value: 46.6 - type: ndcg_at_3 value: 33.822 - type: ndcg_at_5 value: 35.454 - type: precision_at_1 value: 29.909000000000002 - type: precision_at_10 value: 7.180000000000001 - type: precision_at_100 value: 1.153 - type: precision_at_1000 value: 0.155 - type: precision_at_3 value: 16.134 - type: precision_at_5 value: 11.256 - type: recall_at_1 value: 24.417 - type: recall_at_10 value: 50.260000000000005 - type: recall_at_100 value: 73.55699999999999 - type: recall_at_1000 value: 91.216 - type: recall_at_3 value: 35.971 - type: recall_at_5 value: 40.793 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 24.266916666666663 - type: map_at_10 value: 32.75025 - type: map_at_100 value: 33.91341666666667 - type: map_at_1000 value: 34.031749999999995 - type: map_at_3 value: 30.166416666666674 - type: map_at_5 value: 31.577000000000005 - type: mrr_at_1 value: 28.828166666666664 - type: mrr_at_10 value: 36.80991666666667 - type: mrr_at_100 value: 37.67075 - type: mrr_at_1000 value: 37.733 - type: mrr_at_3 value: 34.513416666666664 - type: mrr_at_5 value: 35.788 - type: ndcg_at_1 value: 28.828166666666664 - type: ndcg_at_10 value: 37.796 - type: ndcg_at_100 value: 42.94783333333333 - type: ndcg_at_1000 value: 45.38908333333333 - type: ndcg_at_3 value: 33.374750000000006 - type: ndcg_at_5 value: 35.379666666666665 - type: precision_at_1 value: 28.828166666666664 - type: precision_at_10 value: 6.615749999999999 - type: precision_at_100 value: 1.0848333333333333 - type: precision_at_1000 value: 0.1484166666666667 - type: precision_at_3 value: 15.347833333333332 - type: precision_at_5 value: 10.848916666666666 - type: recall_at_1 value: 24.266916666666663 - type: recall_at_10 value: 48.73458333333333 - type: recall_at_100 value: 71.56341666666667 - type: 
recall_at_1000 value: 88.63091666666668 - type: recall_at_3 value: 36.31208333333333 - type: recall_at_5 value: 41.55633333333333 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackStatsRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 23.497 - type: map_at_10 value: 30.249 - type: map_at_100 value: 30.947000000000003 - type: map_at_1000 value: 31.049 - type: map_at_3 value: 28.188000000000002 - type: map_at_5 value: 29.332 - type: mrr_at_1 value: 26.687 - type: mrr_at_10 value: 33.182 - type: mrr_at_100 value: 33.794999999999995 - type: mrr_at_1000 value: 33.873 - type: mrr_at_3 value: 31.263 - type: mrr_at_5 value: 32.428000000000004 - type: ndcg_at_1 value: 26.687 - type: ndcg_at_10 value: 34.252 - type: ndcg_at_100 value: 38.083 - type: ndcg_at_1000 value: 40.682 - type: ndcg_at_3 value: 30.464999999999996 - type: ndcg_at_5 value: 32.282 - type: precision_at_1 value: 26.687 - type: precision_at_10 value: 5.2909999999999995 - type: precision_at_100 value: 0.788 - type: precision_at_1000 value: 0.109 - type: precision_at_3 value: 13.037 - type: precision_at_5 value: 9.049 - type: recall_at_1 value: 23.497 - type: recall_at_10 value: 43.813 - type: recall_at_100 value: 61.88399999999999 - type: recall_at_1000 value: 80.926 - type: recall_at_3 value: 33.332 - type: recall_at_5 value: 37.862 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackTexRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 16.073 - type: map_at_10 value: 22.705000000000002 - type: map_at_100 value: 23.703 - type: map_at_1000 value: 23.833 - type: map_at_3 value: 20.593 - type: map_at_5 value: 21.7 - type: mrr_at_1 value: 19.683 - type: mrr_at_10 value: 26.39 - type: mrr_at_100 value: 27.264 - type: mrr_at_1000 value: 27.349 - type: mrr_at_3 value: 24.409 - type: mrr_at_5 value: 25.474000000000004 - type: ndcg_at_1 value: 19.683 - type: ndcg_at_10 value: 27.014 - type: ndcg_at_100 value: 31.948 - type: ndcg_at_1000 value: 35.125 - type: ndcg_at_3 value: 23.225 - type: ndcg_at_5 value: 24.866 - type: precision_at_1 value: 19.683 - type: precision_at_10 value: 4.948 - type: precision_at_100 value: 0.876 - type: precision_at_1000 value: 0.133 - type: precision_at_3 value: 10.943 - type: precision_at_5 value: 7.86 - type: recall_at_1 value: 16.073 - type: recall_at_10 value: 36.283 - type: recall_at_100 value: 58.745999999999995 - type: recall_at_1000 value: 81.711 - type: recall_at_3 value: 25.637 - type: recall_at_5 value: 29.919 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackUnixRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 25.776 - type: map_at_10 value: 33.317 - type: map_at_100 value: 34.437 - type: map_at_1000 value: 34.54 - type: map_at_3 value: 30.706 - type: map_at_5 value: 32.202999999999996 - type: mrr_at_1 value: 30.224 - type: mrr_at_10 value: 37.34 - type: mrr_at_100 value: 38.268 - type: mrr_at_1000 value: 38.335 - type: mrr_at_3 value: 35.075 - type: mrr_at_5 value: 36.348 - type: ndcg_at_1 value: 30.224 - type: ndcg_at_10 value: 38.083 - type: ndcg_at_100 value: 43.413000000000004 - type: ndcg_at_1000 value: 45.856 - type: ndcg_at_3 value: 33.437 - type: ndcg_at_5 value: 35.661 - type: precision_at_1 value: 30.224 - type: precision_at_10 value: 6.1850000000000005 - type: precision_at_100 value: 1.0030000000000001 - type: precision_at_1000 value: 0.132 - type: precision_at_3 value: 14.646 - type: precision_at_5 
value: 10.428999999999998 - type: recall_at_1 value: 25.776 - type: recall_at_10 value: 48.787000000000006 - type: recall_at_100 value: 72.04899999999999 - type: recall_at_1000 value: 89.339 - type: recall_at_3 value: 36.192 - type: recall_at_5 value: 41.665 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackWebmastersRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 23.156 - type: map_at_10 value: 30.886000000000003 - type: map_at_100 value: 32.551 - type: map_at_1000 value: 32.769 - type: map_at_3 value: 28.584 - type: map_at_5 value: 29.959999999999997 - type: mrr_at_1 value: 28.260999999999996 - type: mrr_at_10 value: 35.555 - type: mrr_at_100 value: 36.687 - type: mrr_at_1000 value: 36.742999999999995 - type: mrr_at_3 value: 33.531 - type: mrr_at_5 value: 34.717 - type: ndcg_at_1 value: 28.260999999999996 - type: ndcg_at_10 value: 36.036 - type: ndcg_at_100 value: 42.675000000000004 - type: ndcg_at_1000 value: 45.303 - type: ndcg_at_3 value: 32.449 - type: ndcg_at_5 value: 34.293 - type: precision_at_1 value: 28.260999999999996 - type: precision_at_10 value: 6.837999999999999 - type: precision_at_100 value: 1.4569999999999999 - type: precision_at_1000 value: 0.23500000000000001 - type: precision_at_3 value: 15.217 - type: precision_at_5 value: 11.028 - type: recall_at_1 value: 23.156 - type: recall_at_10 value: 45.251999999999995 - type: recall_at_100 value: 75.339 - type: recall_at_1000 value: 91.56 - type: recall_at_3 value: 34.701 - type: recall_at_5 value: 39.922999999999995 - task: type: Retrieval dataset: type: BeIR/cqadupstack name: MTEB CQADupstackWordpressRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 19.846 - type: map_at_10 value: 26.367 - type: map_at_100 value: 27.439999999999998 - type: map_at_1000 value: 27.552 - type: map_at_3 value: 24.006 - type: map_at_5 value: 25.230999999999998 - type: mrr_at_1 value: 21.257 - type: mrr_at_10 value: 28.071 - type: mrr_at_100 value: 29.037000000000003 - type: mrr_at_1000 value: 29.119 - type: mrr_at_3 value: 25.692999999999998 - type: mrr_at_5 value: 27.006000000000004 - type: ndcg_at_1 value: 21.257 - type: ndcg_at_10 value: 30.586000000000002 - type: ndcg_at_100 value: 35.949 - type: ndcg_at_1000 value: 38.728 - type: ndcg_at_3 value: 25.862000000000002 - type: ndcg_at_5 value: 27.967 - type: precision_at_1 value: 21.257 - type: precision_at_10 value: 4.861 - type: precision_at_100 value: 0.8130000000000001 - type: precision_at_1000 value: 0.116 - type: precision_at_3 value: 10.906 - type: precision_at_5 value: 7.763000000000001 - type: recall_at_1 value: 19.846 - type: recall_at_10 value: 41.805 - type: recall_at_100 value: 66.89699999999999 - type: recall_at_1000 value: 87.401 - type: recall_at_3 value: 29.261 - type: recall_at_5 value: 34.227000000000004 - task: type: Retrieval dataset: type: climate-fever name: MTEB ClimateFEVER config: default split: test revision: None metrics: - type: map_at_1 value: 10.333 - type: map_at_10 value: 17.14 - type: map_at_100 value: 18.878 - type: map_at_1000 value: 19.067 - type: map_at_3 value: 14.123 - type: map_at_5 value: 15.699 - type: mrr_at_1 value: 23.192 - type: mrr_at_10 value: 33.553 - type: mrr_at_100 value: 34.553 - type: mrr_at_1000 value: 34.603 - type: mrr_at_3 value: 29.848000000000003 - type: mrr_at_5 value: 32.18 - type: ndcg_at_1 value: 23.192 - type: ndcg_at_10 value: 24.707 - type: ndcg_at_100 value: 31.701 - type: ndcg_at_1000 value: 35.260999999999996 - type: 
ndcg_at_3 value: 19.492 - type: ndcg_at_5 value: 21.543 - type: precision_at_1 value: 23.192 - type: precision_at_10 value: 7.824000000000001 - type: precision_at_100 value: 1.52 - type: precision_at_1000 value: 0.218 - type: precision_at_3 value: 14.180000000000001 - type: precision_at_5 value: 11.530999999999999 - type: recall_at_1 value: 10.333 - type: recall_at_10 value: 30.142999999999997 - type: recall_at_100 value: 54.298 - type: recall_at_1000 value: 74.337 - type: recall_at_3 value: 17.602999999999998 - type: recall_at_5 value: 22.938 - task: type: Retrieval dataset: type: dbpedia-entity name: MTEB DBPedia config: default split: test revision: None metrics: - type: map_at_1 value: 8.03 - type: map_at_10 value: 17.345 - type: map_at_100 value: 23.462 - type: map_at_1000 value: 24.77 - type: map_at_3 value: 12.714 - type: map_at_5 value: 14.722 - type: mrr_at_1 value: 61.0 - type: mrr_at_10 value: 69.245 - type: mrr_at_100 value: 69.715 - type: mrr_at_1000 value: 69.719 - type: mrr_at_3 value: 67.583 - type: mrr_at_5 value: 68.521 - type: ndcg_at_1 value: 47.625 - type: ndcg_at_10 value: 35.973 - type: ndcg_at_100 value: 39.875 - type: ndcg_at_1000 value: 46.922000000000004 - type: ndcg_at_3 value: 40.574 - type: ndcg_at_5 value: 38.18 - type: precision_at_1 value: 61.0 - type: precision_at_10 value: 29.049999999999997 - type: precision_at_100 value: 8.828 - type: precision_at_1000 value: 1.8290000000000002 - type: precision_at_3 value: 45.333 - type: precision_at_5 value: 37.9 - type: recall_at_1 value: 8.03 - type: recall_at_10 value: 22.334 - type: recall_at_100 value: 45.919 - type: recall_at_1000 value: 68.822 - type: recall_at_3 value: 14.038999999999998 - type: recall_at_5 value: 17.118 - task: type: Classification dataset: type: mteb/emotion name: MTEB EmotionClassification config: default split: test revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37 metrics: - type: accuracy value: 44.714999999999996 - type: f1 value: 39.83929362259356 - task: type: Retrieval dataset: type: fever name: MTEB FEVER config: default split: test revision: None metrics: - type: map_at_1 value: 52.242999999999995 - type: map_at_10 value: 64.087 - type: map_at_100 value: 64.549 - type: map_at_1000 value: 64.567 - type: map_at_3 value: 61.667 - type: map_at_5 value: 63.266 - type: mrr_at_1 value: 56.271 - type: mrr_at_10 value: 68.146 - type: mrr_at_100 value: 68.524 - type: mrr_at_1000 value: 68.53200000000001 - type: mrr_at_3 value: 65.869 - type: mrr_at_5 value: 67.37100000000001 - type: ndcg_at_1 value: 56.271 - type: ndcg_at_10 value: 70.109 - type: ndcg_at_100 value: 72.09 - type: ndcg_at_1000 value: 72.479 - type: ndcg_at_3 value: 65.559 - type: ndcg_at_5 value: 68.242 - type: precision_at_1 value: 56.271 - type: precision_at_10 value: 9.286999999999999 - type: precision_at_100 value: 1.039 - type: precision_at_1000 value: 0.109 - type: precision_at_3 value: 26.308 - type: precision_at_5 value: 17.291 - type: recall_at_1 value: 52.242999999999995 - type: recall_at_10 value: 84.71 - type: recall_at_100 value: 93.309 - type: recall_at_1000 value: 96.013 - type: recall_at_3 value: 72.554 - type: recall_at_5 value: 79.069 - task: type: Retrieval dataset: type: fiqa name: MTEB FiQA2018 config: default split: test revision: None metrics: - type: map_at_1 value: 14.346 - type: map_at_10 value: 24.552 - type: map_at_100 value: 26.161 - type: map_at_1000 value: 26.345000000000002 - type: map_at_3 value: 21.208 - type: map_at_5 value: 22.959 - type: mrr_at_1 value: 29.166999999999998 - type: mrr_at_10 
value: 38.182 - type: mrr_at_100 value: 39.22 - type: mrr_at_1000 value: 39.263 - type: mrr_at_3 value: 35.983 - type: mrr_at_5 value: 37.14 - type: ndcg_at_1 value: 29.166999999999998 - type: ndcg_at_10 value: 31.421 - type: ndcg_at_100 value: 38.129999999999995 - type: ndcg_at_1000 value: 41.569 - type: ndcg_at_3 value: 28.172000000000004 - type: ndcg_at_5 value: 29.029 - type: precision_at_1 value: 29.166999999999998 - type: precision_at_10 value: 8.997 - type: precision_at_100 value: 1.5709999999999997 - type: precision_at_1000 value: 0.22 - type: precision_at_3 value: 19.187 - type: precision_at_5 value: 13.980999999999998 - type: recall_at_1 value: 14.346 - type: recall_at_10 value: 37.963 - type: recall_at_100 value: 63.43299999999999 - type: recall_at_1000 value: 84.057 - type: recall_at_3 value: 26.119999999999997 - type: recall_at_5 value: 30.988 - task: type: Retrieval dataset: type: hotpotqa name: MTEB HotpotQA config: default split: test revision: None metrics: - type: map_at_1 value: 33.059 - type: map_at_10 value: 46.421 - type: map_at_100 value: 47.323 - type: map_at_1000 value: 47.403 - type: map_at_3 value: 43.553999999999995 - type: map_at_5 value: 45.283 - type: mrr_at_1 value: 66.117 - type: mrr_at_10 value: 73.10900000000001 - type: mrr_at_100 value: 73.444 - type: mrr_at_1000 value: 73.46000000000001 - type: mrr_at_3 value: 71.70400000000001 - type: mrr_at_5 value: 72.58099999999999 - type: ndcg_at_1 value: 66.117 - type: ndcg_at_10 value: 55.696999999999996 - type: ndcg_at_100 value: 59.167 - type: ndcg_at_1000 value: 60.809000000000005 - type: ndcg_at_3 value: 51.243 - type: ndcg_at_5 value: 53.627 - type: precision_at_1 value: 66.117 - type: precision_at_10 value: 11.538 - type: precision_at_100 value: 1.429 - type: precision_at_1000 value: 0.165 - type: precision_at_3 value: 31.861 - type: precision_at_5 value: 20.997 - type: recall_at_1 value: 33.059 - type: recall_at_10 value: 57.691 - type: recall_at_100 value: 71.458 - type: recall_at_1000 value: 82.35 - type: recall_at_3 value: 47.792 - type: recall_at_5 value: 52.492000000000004 - task: type: Classification dataset: type: mteb/imdb name: MTEB ImdbClassification config: default split: test revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7 metrics: - type: accuracy value: 80.544 - type: ap value: 74.69592367984956 - type: f1 value: 80.51138138449883 - task: type: Retrieval dataset: type: msmarco name: MTEB MSMARCO config: default split: dev revision: None metrics: - type: map_at_1 value: 17.095 - type: map_at_10 value: 28.038999999999998 - type: map_at_100 value: 29.246 - type: map_at_1000 value: 29.311 - type: map_at_3 value: 24.253 - type: map_at_5 value: 26.442 - type: mrr_at_1 value: 17.535999999999998 - type: mrr_at_10 value: 28.53 - type: mrr_at_100 value: 29.697000000000003 - type: mrr_at_1000 value: 29.755 - type: mrr_at_3 value: 24.779999999999998 - type: mrr_at_5 value: 26.942 - type: ndcg_at_1 value: 17.549999999999997 - type: ndcg_at_10 value: 34.514 - type: ndcg_at_100 value: 40.497 - type: ndcg_at_1000 value: 42.17 - type: ndcg_at_3 value: 26.764 - type: ndcg_at_5 value: 30.678 - type: precision_at_1 value: 17.549999999999997 - type: precision_at_10 value: 5.692 - type: precision_at_100 value: 0.8699999999999999 - type: precision_at_1000 value: 0.101 - type: precision_at_3 value: 11.562 - type: precision_at_5 value: 8.917 - type: recall_at_1 value: 17.095 - type: recall_at_10 value: 54.642 - type: recall_at_100 value: 82.652 - type: recall_at_1000 value: 95.555 - type: recall_at_3 value: 33.504 
- type: recall_at_5 value: 42.925000000000004 - task: type: Classification dataset: type: mteb/mtop_domain name: MTEB MTOPDomainClassification (en) config: en split: test revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf metrics: - type: accuracy value: 91.75558595531236 - type: f1 value: 91.25979279648296 - task: type: Classification dataset: type: mteb/mtop_intent name: MTEB MTOPIntentClassification (en) config: en split: test revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba metrics: - type: accuracy value: 69.90424076607387 - type: f1 value: 52.067408707562244 - task: type: Classification dataset: type: mteb/amazon_massive_intent name: MTEB MassiveIntentClassification (en) config: en split: test revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 metrics: - type: accuracy value: 70.13449899125757 - type: f1 value: 67.62456762910598 - task: type: Classification dataset: type: mteb/amazon_massive_scenario name: MTEB MassiveScenarioClassification (en) config: en split: test revision: 7d571f92784cd94a019292a1f45445077d0ef634 metrics: - type: accuracy value: 74.862138533961 - type: f1 value: 74.66457222091381 - task: type: Clustering dataset: type: mteb/medrxiv-clustering-p2p name: MTEB MedrxivClusteringP2P config: default split: test revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73 metrics: - type: v_measure value: 34.10761942610792 - task: type: Clustering dataset: type: mteb/medrxiv-clustering-s2s name: MTEB MedrxivClusteringS2S config: default split: test revision: 35191c8c0dca72d8ff3efcd72aa802307d469663 metrics: - type: v_measure value: 31.673172170578408 - task: type: Reranking dataset: type: mteb/mind_small name: MTEB MindSmallReranking config: default split: test revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69 metrics: - type: map value: 32.058704977250315 - type: mrr value: 33.24327760839221 - task: type: Retrieval dataset: type: nfcorpus name: MTEB NFCorpus config: default split: test revision: None metrics: - type: map_at_1 value: 5.163 - type: map_at_10 value: 11.652999999999999 - type: map_at_100 value: 14.849 - type: map_at_1000 value: 16.253999999999998 - type: map_at_3 value: 8.616999999999999 - type: map_at_5 value: 10.100000000000001 - type: mrr_at_1 value: 44.272 - type: mrr_at_10 value: 52.25 - type: mrr_at_100 value: 52.761 - type: mrr_at_1000 value: 52.811 - type: mrr_at_3 value: 50.31 - type: mrr_at_5 value: 51.347 - type: ndcg_at_1 value: 42.105 - type: ndcg_at_10 value: 32.044 - type: ndcg_at_100 value: 29.763 - type: ndcg_at_1000 value: 38.585 - type: ndcg_at_3 value: 36.868 - type: ndcg_at_5 value: 35.154999999999994 - type: precision_at_1 value: 43.653 - type: precision_at_10 value: 23.622 - type: precision_at_100 value: 7.7490000000000006 - type: precision_at_1000 value: 2.054 - type: precision_at_3 value: 34.262 - type: precision_at_5 value: 30.154999999999998 - type: recall_at_1 value: 5.163 - type: recall_at_10 value: 15.478 - type: recall_at_100 value: 30.424 - type: recall_at_1000 value: 62.67 - type: recall_at_3 value: 9.615 - type: recall_at_5 value: 12.369 - task: type: Retrieval dataset: type: nq name: MTEB NQ config: default split: test revision: None metrics: - type: map_at_1 value: 21.618000000000002 - type: map_at_10 value: 35.465 - type: map_at_100 value: 36.712 - type: map_at_1000 value: 36.757 - type: map_at_3 value: 31.189 - type: map_at_5 value: 33.537 - type: mrr_at_1 value: 24.305 - type: mrr_at_10 value: 37.653 - type: mrr_at_100 value: 38.662 - type: mrr_at_1000 value: 38.694 - type: mrr_at_3 value: 33.889 - type: mrr_at_5 value: 
35.979 - type: ndcg_at_1 value: 24.305 - type: ndcg_at_10 value: 43.028 - type: ndcg_at_100 value: 48.653999999999996 - type: ndcg_at_1000 value: 49.733 - type: ndcg_at_3 value: 34.768 - type: ndcg_at_5 value: 38.753 - type: precision_at_1 value: 24.305 - type: precision_at_10 value: 7.59 - type: precision_at_100 value: 1.076 - type: precision_at_1000 value: 0.11800000000000001 - type: precision_at_3 value: 16.271 - type: precision_at_5 value: 12.068 - type: recall_at_1 value: 21.618000000000002 - type: recall_at_10 value: 63.977 - type: recall_at_100 value: 89.03999999999999 - type: recall_at_1000 value: 97.10600000000001 - type: recall_at_3 value: 42.422 - type: recall_at_5 value: 51.629000000000005 - task: type: Retrieval dataset: type: quora name: MTEB QuoraRetrieval config: default split: test revision: None metrics: - type: map_at_1 value: 69.405 - type: map_at_10 value: 83.05 - type: map_at_100 value: 83.684 - type: map_at_1000 value: 83.70400000000001 - type: map_at_3 value: 80.08800000000001 - type: map_at_5 value: 81.937 - type: mrr_at_1 value: 79.85 - type: mrr_at_10 value: 86.369 - type: mrr_at_100 value: 86.48599999999999 - type: mrr_at_1000 value: 86.48700000000001 - type: mrr_at_3 value: 85.315 - type: mrr_at_5 value: 86.044 - type: ndcg_at_1 value: 79.86999999999999 - type: ndcg_at_10 value: 87.04499999999999 - type: ndcg_at_100 value: 88.373 - type: ndcg_at_1000 value: 88.531 - type: ndcg_at_3 value: 84.04 - type: ndcg_at_5 value: 85.684 - type: precision_at_1 value: 79.86999999999999 - type: precision_at_10 value: 13.183 - type: precision_at_100 value: 1.51 - type: precision_at_1000 value: 0.156 - type: precision_at_3 value: 36.67 - type: precision_at_5 value: 24.12 - type: recall_at_1 value: 69.405 - type: recall_at_10 value: 94.634 - type: recall_at_100 value: 99.214 - type: recall_at_1000 value: 99.958 - type: recall_at_3 value: 85.992 - type: recall_at_5 value: 90.656 - task: type: Clustering dataset: type: mteb/reddit-clustering name: MTEB RedditClustering config: default split: test revision: 24640382cdbf8abc73003fb0fa6d111a705499eb metrics: - type: v_measure value: 50.191676323145465 - task: type: Clustering dataset: type: mteb/reddit-clustering-p2p name: MTEB RedditClusteringP2P config: default split: test revision: 282350215ef01743dc01b456c7f5241fa8937f16 metrics: - type: v_measure value: 56.4874020363744 - task: type: Retrieval dataset: type: scidocs name: MTEB SCIDOCS config: default split: test revision: None metrics: - type: map_at_1 value: 4.228 - type: map_at_10 value: 11.245 - type: map_at_100 value: 13.353000000000002 - type: map_at_1000 value: 13.665 - type: map_at_3 value: 7.779999999999999 - type: map_at_5 value: 9.405 - type: mrr_at_1 value: 20.9 - type: mrr_at_10 value: 31.657999999999998 - type: mrr_at_100 value: 32.769999999999996 - type: mrr_at_1000 value: 32.833 - type: mrr_at_3 value: 28.333000000000002 - type: mrr_at_5 value: 30.043 - type: ndcg_at_1 value: 20.9 - type: ndcg_at_10 value: 19.073 - type: ndcg_at_100 value: 27.055 - type: ndcg_at_1000 value: 32.641 - type: ndcg_at_3 value: 17.483999999999998 - type: ndcg_at_5 value: 15.42 - type: precision_at_1 value: 20.9 - type: precision_at_10 value: 10.17 - type: precision_at_100 value: 2.162 - type: precision_at_1000 value: 0.35100000000000003 - type: precision_at_3 value: 16.467000000000002 - type: precision_at_5 value: 13.68 - type: recall_at_1 value: 4.228 - type: recall_at_10 value: 20.573 - type: recall_at_100 value: 43.887 - type: recall_at_1000 value: 71.22 - type: recall_at_3 value: 
10.023 - type: recall_at_5 value: 13.873 - task: type: STS dataset: type: mteb/sickr-sts name: MTEB SICK-R config: default split: test revision: a6ea5a8cab320b040a23452cc28066d9beae2cee metrics: - type: cos_sim_pearson value: 82.77965135067481 - type: cos_sim_spearman value: 75.85121335808076 - type: euclidean_pearson value: 80.09115175262697 - type: euclidean_spearman value: 75.72249155647123 - type: manhattan_pearson value: 79.89723577351782 - type: manhattan_spearman value: 75.49855259442387 - task: type: STS dataset: type: mteb/sts12-sts name: MTEB STS12 config: default split: test revision: a0d554a64d88156834ff5ae9920b964011b16384 metrics: - type: cos_sim_pearson value: 80.46084116030949 - type: cos_sim_spearman value: 72.57579204392951 - type: euclidean_pearson value: 76.39020830763684 - type: euclidean_spearman value: 72.3718627025895 - type: manhattan_pearson value: 76.6148833027359 - type: manhattan_spearman value: 72.57570008442319 - task: type: STS dataset: type: mteb/sts13-sts name: MTEB STS13 config: default split: test revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca metrics: - type: cos_sim_pearson value: 80.43678068337017 - type: cos_sim_spearman value: 82.38941154076062 - type: euclidean_pearson value: 81.59260573633661 - type: euclidean_spearman value: 82.31144262574114 - type: manhattan_pearson value: 81.43266909137056 - type: manhattan_spearman value: 82.14704293004861 - task: type: STS dataset: type: mteb/sts14-sts name: MTEB STS14 config: default split: test revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375 metrics: - type: cos_sim_pearson value: 80.73713431763163 - type: cos_sim_spearman value: 77.97860512809388 - type: euclidean_pearson value: 80.35755041527027 - type: euclidean_spearman value: 78.021703511412 - type: manhattan_pearson value: 80.24440317109162 - type: manhattan_spearman value: 77.93165415697575 - task: type: STS dataset: type: mteb/sts15-sts name: MTEB STS15 config: default split: test revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3 metrics: - type: cos_sim_pearson value: 85.15111852351204 - type: cos_sim_spearman value: 86.54032447238258 - type: euclidean_pearson value: 86.14157021537433 - type: euclidean_spearman value: 86.67537291929713 - type: manhattan_pearson value: 86.081041854808 - type: manhattan_spearman value: 86.61561701560558 - task: type: STS dataset: type: mteb/sts16-sts name: MTEB STS16 config: default split: test revision: 4d8694f8f0e0100860b497b999b3dbed754a0513 metrics: - type: cos_sim_pearson value: 81.34532445104026 - type: cos_sim_spearman value: 83.31325001474116 - type: euclidean_pearson value: 82.81892375201032 - type: euclidean_spearman value: 83.4521695148055 - type: manhattan_pearson value: 82.72503790526163 - type: manhattan_spearman value: 83.37833652941349 - task: type: STS dataset: type: mteb/sts17-crosslingual-sts name: MTEB STS17 (en-en) config: en-en split: test revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d metrics: - type: cos_sim_pearson value: 87.25463453839801 - type: cos_sim_spearman value: 88.27655263515948 - type: euclidean_pearson value: 88.0248334411439 - type: euclidean_spearman value: 88.18141448876868 - type: manhattan_pearson value: 87.8080451127279 - type: manhattan_spearman value: 88.01028114423058 - task: type: STS dataset: type: mteb/sts22-crosslingual-sts name: MTEB STS22 (en) config: en split: test revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80 metrics: - type: cos_sim_pearson value: 63.57551045355218 - type: cos_sim_spearman value: 66.67614095126629 - type: euclidean_pearson 
value: 66.0787243112528 - type: euclidean_spearman value: 66.83660560636939 - type: manhattan_pearson value: 66.74684019662031 - type: manhattan_spearman value: 67.11761598074368 - task: type: STS dataset: type: mteb/stsbenchmark-sts name: MTEB STSBenchmark config: default split: test revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831 metrics: - type: cos_sim_pearson value: 83.70881496766829 - type: cos_sim_spearman value: 84.37803542941634 - type: euclidean_pearson value: 84.84501245857096 - type: euclidean_spearman value: 84.47088079741476 - type: manhattan_pearson value: 84.77244090794765 - type: manhattan_spearman value: 84.43307343706205 - task: type: Reranking dataset: type: mteb/scidocs-reranking name: MTEB SciDocsRR config: default split: test revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab metrics: - type: map value: 81.53946254759089 - type: mrr value: 94.68259953554072 - task: type: Retrieval dataset: type: scifact name: MTEB SciFact config: default split: test revision: None metrics: - type: map_at_1 value: 51.817 - type: map_at_10 value: 62.339999999999996 - type: map_at_100 value: 62.88 - type: map_at_1000 value: 62.909000000000006 - type: map_at_3 value: 59.004 - type: map_at_5 value: 60.906000000000006 - type: mrr_at_1 value: 54.333 - type: mrr_at_10 value: 63.649 - type: mrr_at_100 value: 64.01 - type: mrr_at_1000 value: 64.039 - type: mrr_at_3 value: 61.056 - type: mrr_at_5 value: 62.639 - type: ndcg_at_1 value: 54.333 - type: ndcg_at_10 value: 67.509 - type: ndcg_at_100 value: 69.69999999999999 - type: ndcg_at_1000 value: 70.613 - type: ndcg_at_3 value: 61.729 - type: ndcg_at_5 value: 64.696 - type: precision_at_1 value: 54.333 - type: precision_at_10 value: 9.2 - type: precision_at_100 value: 1.043 - type: precision_at_1000 value: 0.11199999999999999 - type: precision_at_3 value: 24.0 - type: precision_at_5 value: 16.2 - type: recall_at_1 value: 51.817 - type: recall_at_10 value: 82.056 - type: recall_at_100 value: 91.667 - type: recall_at_1000 value: 99.0 - type: recall_at_3 value: 66.717 - type: recall_at_5 value: 74.17200000000001 - task: type: PairClassification dataset: type: mteb/sprintduplicatequestions-pairclassification name: MTEB SprintDuplicateQuestions config: default split: test revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46 metrics: - type: cos_sim_accuracy value: 99.82475247524752 - type: cos_sim_ap value: 95.4781199603258 - type: cos_sim_f1 value: 91.16186693147964 - type: cos_sim_precision value: 90.53254437869822 - type: cos_sim_recall value: 91.8 - type: dot_accuracy value: 99.75049504950495 - type: dot_ap value: 93.05183539809457 - type: dot_f1 value: 87.31117824773412 - type: dot_precision value: 87.93103448275862 - type: dot_recall value: 86.7 - type: euclidean_accuracy value: 99.82475247524752 - type: euclidean_ap value: 95.38547978154382 - type: euclidean_f1 value: 91.16325511732403 - type: euclidean_precision value: 91.02691924227318 - type: euclidean_recall value: 91.3 - type: manhattan_accuracy value: 99.82574257425742 - type: manhattan_ap value: 95.47237521890308 - type: manhattan_f1 value: 91.27849355797821 - type: manhattan_precision value: 90.47151277013754 - type: manhattan_recall value: 92.10000000000001 - type: max_accuracy value: 99.82574257425742 - type: max_ap value: 95.4781199603258 - type: max_f1 value: 91.27849355797821 - task: type: Clustering dataset: type: mteb/stackexchange-clustering name: MTEB StackExchangeClustering config: default split: test revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259 metrics: - type: 
v_measure value: 57.542169376331245 - task: type: Clustering dataset: type: mteb/stackexchange-clustering-p2p name: MTEB StackExchangeClusteringP2P config: default split: test revision: 815ca46b2622cec33ccafc3735d572c266efdb44 metrics: - type: v_measure value: 35.74399302634387 - task: type: Reranking dataset: type: mteb/stackoverflowdupquestions-reranking name: MTEB StackOverflowDupQuestions config: default split: test revision: e185fbe320c72810689fc5848eb6114e1ef5ec69 metrics: - type: map value: 49.65076347632749 - type: mrr value: 50.418099057804945 - task: type: Summarization dataset: type: mteb/summeval name: MTEB SummEval config: default split: test revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c metrics: - type: cos_sim_pearson value: 29.73997756592847 - type: cos_sim_spearman value: 29.465208011593308 - type: dot_pearson value: 24.83735342474541 - type: dot_spearman value: 26.005180528584855 - task: type: Retrieval dataset: type: trec-covid name: MTEB TRECCOVID config: default split: test revision: None metrics: - type: map_at_1 value: 0.208 - type: map_at_10 value: 1.434 - type: map_at_100 value: 7.829 - type: map_at_1000 value: 19.807 - type: map_at_3 value: 0.549 - type: map_at_5 value: 0.8330000000000001 - type: mrr_at_1 value: 78.0 - type: mrr_at_10 value: 85.35199999999999 - type: mrr_at_100 value: 85.673 - type: mrr_at_1000 value: 85.673 - type: mrr_at_3 value: 84.667 - type: mrr_at_5 value: 85.06700000000001 - type: ndcg_at_1 value: 72.0 - type: ndcg_at_10 value: 59.214999999999996 - type: ndcg_at_100 value: 44.681 - type: ndcg_at_1000 value: 43.035000000000004 - type: ndcg_at_3 value: 66.53099999999999 - type: ndcg_at_5 value: 63.23 - type: precision_at_1 value: 78.0 - type: precision_at_10 value: 62.4 - type: precision_at_100 value: 45.76 - type: precision_at_1000 value: 19.05 - type: precision_at_3 value: 71.333 - type: precision_at_5 value: 67.2 - type: recall_at_1 value: 0.208 - type: recall_at_10 value: 1.6580000000000001 - type: recall_at_100 value: 11.324 - type: recall_at_1000 value: 41.537 - type: recall_at_3 value: 0.579 - type: recall_at_5 value: 0.8959999999999999 - task: type: Retrieval dataset: type: webis-touche2020 name: MTEB Touche2020 config: default split: test revision: None metrics: - type: map_at_1 value: 2.442 - type: map_at_10 value: 8.863 - type: map_at_100 value: 14.606 - type: map_at_1000 value: 16.258 - type: map_at_3 value: 4.396 - type: map_at_5 value: 6.199000000000001 - type: mrr_at_1 value: 30.612000000000002 - type: mrr_at_10 value: 43.492 - type: mrr_at_100 value: 44.557 - type: mrr_at_1000 value: 44.557 - type: mrr_at_3 value: 40.816 - type: mrr_at_5 value: 42.143 - type: ndcg_at_1 value: 25.509999999999998 - type: ndcg_at_10 value: 22.076 - type: ndcg_at_100 value: 34.098 - type: ndcg_at_1000 value: 46.265 - type: ndcg_at_3 value: 24.19 - type: ndcg_at_5 value: 23.474 - type: precision_at_1 value: 30.612000000000002 - type: precision_at_10 value: 19.796 - type: precision_at_100 value: 7.286 - type: precision_at_1000 value: 1.5310000000000001 - type: precision_at_3 value: 25.85 - type: precision_at_5 value: 24.490000000000002 - type: recall_at_1 value: 2.442 - type: recall_at_10 value: 15.012 - type: recall_at_100 value: 45.865 - type: recall_at_1000 value: 82.958 - type: recall_at_3 value: 5.731 - type: recall_at_5 value: 9.301 - task: type: Classification dataset: type: mteb/toxic_conversations_50k name: MTEB ToxicConversationsClassification config: default split: test revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c metrics: - 
type: accuracy value: 70.974 - type: ap value: 14.534996211286682 - type: f1 value: 54.785946183399005 - task: type: Classification dataset: type: mteb/tweet_sentiment_extraction name: MTEB TweetSentimentExtractionClassification config: default split: test revision: d604517c81ca91fe16a244d1248fc021f9ecee7a metrics: - type: accuracy value: 58.56819468024901 - type: f1 value: 58.92391487111204 - task: type: Clustering dataset: type: mteb/twentynewsgroups-clustering name: MTEB TwentyNewsgroupsClustering config: default split: test revision: 6125ec4e24fa026cec8a478383ee943acfbd5449 metrics: - type: v_measure value: 43.273202335218194 - task: type: PairClassification dataset: type: mteb/twittersemeval2015-pairclassification name: MTEB TwitterSemEval2015 config: default split: test revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1 metrics: - type: cos_sim_accuracy value: 84.37742146986946 - type: cos_sim_ap value: 68.1684129575579 - type: cos_sim_f1 value: 64.93475108748189 - type: cos_sim_precision value: 59.89745876058849 - type: cos_sim_recall value: 70.89709762532982 - type: dot_accuracy value: 80.49710913750968 - type: dot_ap value: 54.699790073944186 - type: dot_f1 value: 54.45130013221684 - type: dot_precision value: 46.74612183125236 - type: dot_recall value: 65.19788918205805 - type: euclidean_accuracy value: 84.5085533766466 - type: euclidean_ap value: 68.38835695236224 - type: euclidean_f1 value: 65.3391121002694 - type: euclidean_precision value: 58.75289656625237 - type: euclidean_recall value: 73.58839050131925 - type: manhattan_accuracy value: 84.40126363473803 - type: manhattan_ap value: 68.09539181555348 - type: manhattan_f1 value: 64.99028182701653 - type: manhattan_precision value: 60.22062134173795 - type: manhattan_recall value: 70.58047493403694 - type: max_accuracy value: 84.5085533766466 - type: max_ap value: 68.38835695236224 - type: max_f1 value: 65.3391121002694 - task: type: PairClassification dataset: type: mteb/twitterurlcorpus-pairclassification name: MTEB TwitterURLCorpus config: default split: test revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf metrics: - type: cos_sim_accuracy value: 88.34167733923235 - type: cos_sim_ap value: 84.84136381147736 - type: cos_sim_f1 value: 77.01434980904001 - type: cos_sim_precision value: 74.27937915742794 - type: cos_sim_recall value: 79.95842315983985 - type: dot_accuracy value: 85.06422944075756 - type: dot_ap value: 76.49446747522325 - type: dot_f1 value: 71.11606520830432 - type: dot_precision value: 64.93638676844785 - type: dot_recall value: 78.59562673236834 - type: euclidean_accuracy value: 88.45810532852097 - type: euclidean_ap value: 84.91526721863501 - type: euclidean_f1 value: 77.04399001750662 - type: euclidean_precision value: 74.62298867162133 - type: euclidean_recall value: 79.62734832152756 - type: manhattan_accuracy value: 88.46004579500912 - type: manhattan_ap value: 84.81590026238194 - type: manhattan_f1 value: 76.97804626491822 - type: manhattan_precision value: 73.79237288135593 - type: manhattan_recall value: 80.45118570988605 - type: max_accuracy value: 88.46004579500912 - type: max_ap value: 84.91526721863501 - type: max_f1 value: 77.04399001750662 pipeline_tag: sentence-similarity tags: - sentence-transformers - feature-extraction - sentence-similarity - transformers - mteb --- # {gte-tiny} This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. 
It is distilled from , with comparable (slightly worse) performance at around half the size. <!--- Describe your model here --> ## Usage (Sentence-Transformers) Using this model is easy once you have sentence-transformers installed: Then you can use the model like this: ## Usage (HuggingFace Transformers) Without sentence-transformers, you can use the model like this: First, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings. ## Evaluation Results <!--- Describe how your model was evaluated --> For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: ## Full Model Architecture ## Citing & Authors <!--- Describe where people can find more information -->",
21
+ "model_explanation_gemini": "\"TaylorAI_gte-tiny performs text classification, retrieval, clustering, reranking, and semantic textual similarity (STS) tasks, achieving competitive metrics across multiple datasets like MTEB Amazon reviews, banking queries, and scientific text clustering.\"\n\n**Model Features**: \n- **Tasks**: Classification, Retrieval, Clustering, Reranking, STS \n- **Datasets**: Amazon reviews (counterfactual/polarity), Banking77, ArXiv/BioRxiv clustering, BIOSSES ("
22
+ }
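The gte-tiny card above mentions sentence-transformers usage, but its code snippets were stripped during extraction. Below is a minimal sketch of the standard sentence-transformers pattern, assuming the checkpoint loads directly via `SentenceTransformer`; the example sentences are illustrative.

```python
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("TaylorAI/gte-tiny")  # 384-dimensional embeddings

sentences = [
    "A llama is grazing on the lawn.",
    "An alpaca-like animal is eating grass outside.",
]
embeddings = model.encode(sentences, normalize_embeddings=True)

# Cosine similarity between the two sentences
print(util.cos_sim(embeddings[0], embeddings[1]))
```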
model_data_json/TheBloke_Llama-2-7B-Chat-GGUF.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "TheBloke/Llama-2-7B-Chat-GGUF",
3
+ "downloads": 83135,
4
+ "tags": [
5
+ "transformers",
6
+ "gguf",
7
+ "llama",
8
+ "facebook",
9
+ "meta",
10
+ "pytorch",
11
+ "llama-2",
12
+ "text-generation",
13
+ "en",
14
+ "arxiv:2307.09288",
15
+ "base_model:meta-llama/Llama-2-7b-chat-hf",
16
+ "base_model:quantized:meta-llama/Llama-2-7b-chat-hf",
17
+ "license:llama2",
18
+ "region:us"
19
+ ],
20
+ "description": "--- language: - en license: llama2 tags: - facebook - meta - pytorch - llama - llama-2 model_name: Llama 2 7B Chat arxiv: 2307.09288 base_model: meta-llama/Llama-2-7b-chat-hf inference: false model_creator: Meta Llama 2 model_type: llama pipeline_tag: text-generation prompt_template: '[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don''t know the answer to a question, please don''t share false information. <</SYS>> {prompt}[/INST] ' quantized_by: TheBloke --- <!-- header start --> <!-- 200823 --> <div style=\"width: auto; margin-left: auto; margin-right: auto\"> <img src=\" alt=\"TheBlokeAI\" style=\"width: 100%; min-width: 400px; display: block; margin: auto;\"> </div> <div style=\"display: flex; justify-content: space-between; width: 100%;\"> <div style=\"display: flex; flex-direction: column; align-items: flex-start;\"> <p style=\"margin-top: 0.5em; margin-bottom: 0em;\"><a href=\" & support: TheBloke's Discord server</a></p> </div> <div style=\"display: flex; flex-direction: column; align-items: flex-end;\"> <p style=\"margin-top: 0.5em; margin-bottom: 0em;\"><a href=\" to contribute? TheBloke's Patreon page</a></p> </div> </div> <div style=\"text-align:center; margin-top: 0em; margin-bottom: 0em\"><p style=\"margin-top: 0.25em; margin-bottom: 0em;\">TheBloke's LLM work is generously supported by a grant from <a href=\" horowitz (a16z)</a></p></div> <hr style=\"margin-top: 1.0em; margin-bottom: 1.0em;\"> <!-- header end --> # Llama 2 7B Chat - GGUF - Model creator: Meta Llama 2 - Original model: Llama 2 7B Chat <!-- description start --> ## Description This repo contains GGUF format model files for Meta Llama 2's Llama 2 7B Chat. <!-- description end --> <!-- README_GGUF.md-about-gguf start --> ### About GGUF GGUF is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp. GGUF offers numerous advantages over GGML, such as better tokenisation, and support for special tokens. It is also supports metadata, and is designed to be extensible. Here is an incomplate list of clients and libraries that are known to support GGUF: * llama.cpp. The source project for GGUF. Offers a CLI and a server option. * text-generation-webui, the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration. * KoboldCpp, a fully featured web UI, with GPU accel across all platforms and GPU architectures. Especially good for story telling. * LM Studio, an easy-to-use and powerful local GUI for Windows and macOS (Silicon), with GPU acceleration. * LoLLMS Web UI, a great web UI with many interesting and unique features, including a full model library for easy model selection. * Faraday.dev, an attractive and easy to use character-based chat GUI for Windows and macOS (both Silicon and Intel), with GPU acceleration. * ctransformers, a Python library with GPU accel, LangChain support, and OpenAI-compatible AI server. * llama-cpp-python, a Python library with GPU accel, LangChain support, and OpenAI-compatible API server. 
* candle, a Rust ML framework with a focus on performance, including GPU support, and ease of use. <!-- README_GGUF.md-about-gguf end --> <!-- repositories-available start --> ## Repositories available * AWQ model(s) for GPU inference. * GPTQ models for GPU inference, with multiple quantisation parameter options. * 2, 3, 4, 5, 6 and 8-bit GGUF models for CPU+GPU inference * Meta Llama 2's original unquantised fp16 model in pytorch format, for GPU inference and for further conversions <!-- repositories-available end --> <!-- prompt-template start --> ## Prompt template: Llama-2-Chat <!-- prompt-template end --> <!-- compatibility_gguf start --> ## Compatibility These quantised GGUFv2 files are compatible with llama.cpp from August 27th onwards, as of commit d0cee0d36d5be95a0d9088b674dbb27354107221 They are also compatible with many third party UIs and libraries - please see the list at the top of this README. ## Explanation of quantisation methods <details> <summary>Click to see details</summary> The new methods available are: * GGML_TYPE_Q2_K - \"type-1\" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weight. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw) * GGML_TYPE_Q3_K - \"type-0\" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This end up using 3.4375 bpw. * GGML_TYPE_Q4_K - \"type-1\" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw. * GGML_TYPE_Q5_K - \"type-1\" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K resulting in 5.5 bpw * GGML_TYPE_Q6_K - \"type-0\" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw Refer to the Provided Files table below to see what files use which methods, and how. 
</details> <!-- compatibility_gguf end --> <!-- README_GGUF.md-provided-files start --> ## Provided files | Name | Quant method | Bits | Size | Max RAM required | Use case | | ---- | ---- | ---- | ---- | ---- | ----- | | llama-2-7b-chat.Q2_K.gguf | Q2_K | 2 | 2.83 GB| 5.33 GB | smallest, significant quality loss - not recommended for most purposes | | llama-2-7b-chat.Q3_K_S.gguf | Q3_K_S | 3 | 2.95 GB| 5.45 GB | very small, high quality loss | | llama-2-7b-chat.Q3_K_M.gguf | Q3_K_M | 3 | 3.30 GB| 5.80 GB | very small, high quality loss | | llama-2-7b-chat.Q3_K_L.gguf | Q3_K_L | 3 | 3.60 GB| 6.10 GB | small, substantial quality loss | | llama-2-7b-chat.Q4_0.gguf | Q4_0 | 4 | 3.83 GB| 6.33 GB | legacy; small, very high quality loss - prefer using Q3_K_M | | llama-2-7b-chat.Q4_K_S.gguf | Q4_K_S | 4 | 3.86 GB| 6.36 GB | small, greater quality loss | | llama-2-7b-chat.Q4_K_M.gguf | Q4_K_M | 4 | 4.08 GB| 6.58 GB | medium, balanced quality - recommended | | llama-2-7b-chat.Q5_0.gguf | Q5_0 | 5 | 4.65 GB| 7.15 GB | legacy; medium, balanced quality - prefer using Q4_K_M | | llama-2-7b-chat.Q5_K_S.gguf | Q5_K_S | 5 | 4.65 GB| 7.15 GB | large, low quality loss - recommended | | llama-2-7b-chat.Q5_K_M.gguf | Q5_K_M | 5 | 4.78 GB| 7.28 GB | large, very low quality loss - recommended | | llama-2-7b-chat.Q6_K.gguf | Q6_K | 6 | 5.53 GB| 8.03 GB | very large, extremely low quality loss | | llama-2-7b-chat.Q8_0.gguf | Q8_0 | 8 | 7.16 GB| 9.66 GB | very large, extremely low quality loss - not recommended | **Note**: the above RAM figures assume no GPU offloading. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead. <!-- README_GGUF.md-provided-files end --> <!-- README_GGUF.md-how-to-download start --> ## How to download GGUF files **Note for manual downloaders:** You almost never want to clone the entire repo! Multiple different quantisation formats are provided, and most users only want to pick and download a single file. The following clients/libraries will automatically download models for you, providing a list of available models to choose from: - LM Studio - LoLLMS Web UI - Faraday.dev ### In Under Download Model, you can enter the model repo: TheBloke/Llama-2-7b-Chat-GGUF and below it, a specific filename to download, such as: llama-2-7b-chat.q4_K_M.gguf. Then click Download. ### On the command line, including multiple files at once I recommend using the Python library: Then you can download any individual model file to the current directory, at high speed, with a command like this: <details> <summary>More advanced huggingface-cli download usage</summary> You can also download multiple files at once with a pattern: For more documentation on downloading with , please see: HF -> Hub Python Library -> Download files -> Download from the CLI. To accelerate downloads on fast connections (1Gbit/s or higher), install : And set environment variable to : Windows CLI users: Use before running the download command. </details> <!-- README_GGUF.md-how-to-download end --> <!-- README_GGUF.md-how-to-run start --> ## Example command Make sure you are using from commit d0cee0d36d5be95a0d9088b674dbb27354107221 or later. Change to the number of layers to offload to GPU. Remove it if you don't have GPU acceleration. Change to the desired sequence length. For extended sequence models - eg 8K, 16K, 32K - the necessary RoPE scaling parameters are read from the GGUF file and set by llama.cpp automatically. 
If you want to have a chat-style conversation, replace the argument with For other parameters and how to use them, please refer to the llama.cpp documentation ## How to run in Further instructions here: text-generation-webui/docs/llama.cpp.md. ## How to run from Python code You can use GGUF models from Python using the llama-cpp-python or ctransformers libraries. ### How to load this model from Python using ctransformers #### First install the package #### Simple example code to load one of these GGUF models ## How to use with LangChain Here's guides on using llama-cpp-python or ctransformers with LangChain: * LangChain + llama-cpp-python * LangChain + ctransformers <!-- README_GGUF.md-how-to-run end --> <!-- footer start --> <!-- 200823 --> ## Discord For further support, and discussions on these models and AI in general, join us at: TheBloke AI's Discord server ## Thanks, and how to contribute Thanks to the chirper.ai team! Thanks to Clay from gpus.llm-utils.org! I've had a lot of people ask if they can contribute. I enjoy providing models and helping people, and would love to be able to spend even more time doing it, as well as expanding into new projects like fine tuning/training. If you're able and willing to contribute it will be most gratefully received and will help me to keep providing more models, and to start work on new AI projects. Donaters will get priority support on any and all AI/LLM/model questions and requests, access to a private Discord room, plus other benefits. * Patreon: * Ko-Fi: **Special thanks to**: Aemon Algiz. **Patreon special mentions**: Alicia Loh, Stephen Murray, K, Ajan Kanaga, RoA, Magnesian, Deo Leter, Olakabola, Eugene Pentland, zynix, Deep Realms, Raymond Fosdick, Elijah Stavena, Iucharbius, Erik Bjäreholt, Luis Javier Navarrete Lozano, Nicholas, theTransient, John Detwiler, alfie_i, knownsqashed, Mano Prime, Willem Michiel, Enrico Ros, LangChain4j, OG, Michael Dempsey, Pierre Kircher, Pedro Madruga, James Bentley, Thomas Belote, Luke @flexchar, Leonard Tan, Johann-Peter Hartmann, Illia Dulskyi, Fen Risland, Chadd, S_X, Jeff Scroggin, Ken Nordquist, Sean Connelly, Artur Olbinski, Swaroop Kallakuri, Jack West, Ai Maven, David Ziegler, Russ Johnson, transmissions 11, John Villwock, Alps Aficionado, Clay Pascal, Viktor Bowallius, Subspace Studios, Rainer Wilmers, Trenton Dambrowitz, vamX, Michael Levine, 준교 김, Brandon Frisco, Kalila, Trailburnt, Randy H, Talal Aujan, Nathan Dryer, Vadim, 阿明, ReadyPlayerEmma, Tiffany J. Kim, George Stoitzev, Spencer Kim, Jerry Meng, Gabriel Tamborski, Cory Kujawski, Jeffrey Morgan, Spiking Neurons AB, Edmond Seymore, Alexandros Triantafyllidis, Lone Striker, Cap'n Zoog, Nikolai Manek, danny, ya boyyy, Derek Yates, usrbinkat, Mandus, TL, Nathan LeClaire, subjectnull, Imad Khwaja, webtim, Raven Klaugh, Asp the Wyvern, Gabriel Puliatti, Caitlyn Gatomon, Joseph William Delisle, Jonathan Leane, Luke Pendergrass, SuperWojo, Sebastain Graf, Will Dee, Fred von Graf, Andrey, Dan Guido, Daniel P. Andersen, Nitin Borwankar, Elle, Vitor Caleffi, biorpg, jjj, NimbleBox.ai, Pieter, Matthew Berman, terasurfer, Michael Davis, Alex, Stanislav Ovsiannikov Thank you to all my generous patrons and donaters! And thank you again to a16z for their generous grant. <!-- footer end --> <!-- original-model-card start --> # Original model card: Meta Llama 2's Llama 2 7B Chat # **Llama 2** Llama 2 is a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. 
This is the repository for the 7B fine-tuned model, optimized for dialogue use cases and converted for the Hugging Face Transformers format. Links to other models can be found in the index at the bottom. ## Model Details *Note: Use of this model is governed by the Meta license. In order to download the model weights and tokenizer, please visit the website and accept our License before requesting access here.* Meta developed and publicly released the Llama 2 family of large language models (LLMs), a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama-2-Chat, are optimized for dialogue use cases. Llama-2-Chat models outperform open-source chat models on most benchmarks we tested, and in our human evaluations for helpfulness and safety, are on par with some popular closed-source models like ChatGPT and PaLM. **Model Developers** Meta **Variations** Llama 2 comes in a range of parameter sizes — 7B, 13B, and 70B — as well as pretrained and fine-tuned variations. **Input** Models input text only. **Output** Models generate text only. **Model Architecture** Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align to human preferences for helpfulness and safety. ||Training Data|Params|Content Length|GQA|Tokens|LR| |---|---|---|---|---|---|---| |Llama 2|*A new mix of publicly available online data*|7B|4k|&#10007;|2.0T|3.0 x 10<sup>-4</sup>| |Llama 2|*A new mix of publicly available online data*|13B|4k|&#10007;|2.0T|3.0 x 10<sup>-4</sup>| |Llama 2|*A new mix of publicly available online data*|70B|4k|&#10004;|2.0T|1.5 x 10<sup>-4</sup>| *Llama 2 family of models.* Token counts refer to pretraining data only. All models are trained with a global batch-size of 4M tokens. Bigger models - 70B -- use Grouped-Query Attention (GQA) for improved inference scalability. **Model Dates** Llama 2 was trained between January 2023 and July 2023. **Status** This is a static model trained on an offline dataset. Future versions of the tuned models will be released as we improve model safety with community feedback. **License** A custom commercial license is available at: **Research Paper** \"Llama-2: Open Foundation and Fine-tuned Chat Models\" ## Intended Use **Intended Use Cases** Llama 2 is intended for commercial and research use in English. Tuned models are intended for assistant-like chat, whereas pretrained models can be adapted for a variety of natural language generation tasks. To get the expected features and performance for the chat versions, a specific formatting needs to be followed, including the and tags, and tokens, and the whitespaces and breaklines in between (we recommend calling on inputs to avoid double-spaces). See our reference code in github for details: []( **Out-of-scope Uses** Use in any manner that violates applicable laws or regulations (including trade compliance laws).Use in languages other than English. Use in any other way that is prohibited by the Acceptable Use Policy and Licensing Agreement for Llama 2. ## Hardware and Software **Training Factors** We used custom training libraries, Meta's Research Super Cluster, and production clusters for pretraining. Fine-tuning, annotation, and evaluation were also performed on third-party cloud compute. 
**Carbon Footprint** Pretraining utilized a cumulative 3.3M GPU hours of computation on hardware of type A100-80GB (TDP of 350-400W). Estimated total emissions were 539 tCO2eq, 100% of which were offset by Meta’s sustainability program. ||Time (GPU hours)|Power Consumption (W)|Carbon Emitted(tCO<sub>2</sub>eq)| |---|---|---|---| |Llama 2 7B|184320|400|31.22| |Llama 2 13B|368640|400|62.44| |Llama 2 70B|1720320|400|291.42| |Total|3311616||539.00| **CO<sub>2</sub> emissions during pretraining.** Time: total GPU time required for training each model. Power Consumption: peak power capacity per GPU device for the GPUs used adjusted for power usage efficiency. 100% of the emissions are directly offset by Meta's sustainability program, and because we are openly releasing these models, the pretraining costs do not need to be incurred by others. ## Training Data **Overview** Llama 2 was pretrained on 2 trillion tokens of data from publicly available sources. The fine-tuning data includes publicly available instruction datasets, as well as over one million new human-annotated examples. Neither the pretraining nor the fine-tuning datasets include Meta user data. **Data Freshness** The pretraining data has a cutoff of September 2022, but some tuning data is more recent, up to July 2023. ## Evaluation Results In this section, we report the results for the Llama 1 and Llama 2 models on standard academic benchmarks.For all the evaluations, we use our internal evaluations library. |Model|Size|Code|Commonsense Reasoning|World Knowledge|Reading Comprehension|Math|MMLU|BBH|AGI Eval| |---|---|---|---|---|---|---|---|---|---| |Llama 1|7B|14.1|60.8|46.2|58.5|6.95|35.1|30.3|23.9| |Llama 1|13B|18.9|66.1|52.6|62.3|10.9|46.9|37.0|33.9| |Llama 1|33B|26.0|70.0|58.4|67.6|21.4|57.8|39.8|41.7| |Llama 1|65B|30.7|70.7|60.5|68.6|30.8|63.4|43.5|47.6| |Llama 2|7B|16.8|63.9|48.9|61.3|14.6|45.3|32.6|29.3| |Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1| |Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**| **Overall performance on grouped academic benchmarks.** *Code:* We report the average pass@1 scores of our models on HumanEval and MBPP. *Commonsense Reasoning:* We report the average of PIQA, SIQA, HellaSwag, WinoGrande, ARC easy and challenge, OpenBookQA, and CommonsenseQA. We report 7-shot results for CommonSenseQA and 0-shot results for all other benchmarks. *World Knowledge:* We evaluate the 5-shot performance on NaturalQuestions and TriviaQA and report the average. *Reading Comprehension:* For reading comprehension, we report the 0-shot average on SQuAD, QuAC, and BoolQ. *MATH:* We report the average of the GSM8K (8 shot) and MATH (4 shot) benchmarks at top 1. |||TruthfulQA|Toxigen| |---|---|---|---| |Llama 1|7B|27.42|23.00| |Llama 1|13B|41.74|23.08| |Llama 1|33B|44.19|22.57| |Llama 1|65B|48.71|21.77| |Llama 2|7B|33.29|**21.25**| |Llama 2|13B|41.86|26.10| |Llama 2|70B|**50.18**|24.60| **Evaluation of pretrained LLMs on automatic safety benchmarks.** For TruthfulQA, we present the percentage of generations that are both truthful and informative (the higher the better). For ToxiGen, we present the percentage of toxic generations (the smaller the better). |||TruthfulQA|Toxigen| |---|---|---|---| |Llama-2-Chat|7B|57.04|**0.00**| |Llama-2-Chat|13B|62.18|**0.00**| |Llama-2-Chat|70B|**64.14**|0.01| **Evaluation of fine-tuned LLMs on different safety datasets.** Same metric definitions as above. 
## Ethical Considerations and Limitations Llama 2 is a new technology that carries risks with use. Testing conducted to date has been in English, and has not covered, nor could it cover all scenarios. For these reasons, as with all LLMs, Llama 2’s potential outputs cannot be predicted in advance, and the model may in some instances produce inaccurate, biased or other objectionable responses to user prompts. Therefore, before deploying any applications of Llama 2, developers should perform safety testing and tuning tailored to their specific applications of the model. Please see the Responsible Use Guide available at ## Reporting Issues Please report any software “bug,” or other problems with the models through one of the following means: - Reporting issues with the model: github.com/facebookresearch/llama - Reporting problematic content generated by the model: developers.facebook.com/llama_output_feedback - Reporting bugs and security concerns: facebook.com/whitehat/info ## Llama Model Index |Model|Llama2|Llama2-hf|Llama2-chat|Llama2-chat-hf| |---|---|---|---|---| |7B| Link | Link | Link | Link| |13B| Link | Link | Link | Link| |70B| Link | Link | Link | Link| <!-- original-model-card end -->",
21
+ "model_explanation_gemini": "A 7B-parameter Llama 2-based chat model in GGUF format, optimized for helpful, safe, and honest text generation while avoiding harmful or unethical content.\n\n**Features:** \n- Fine-tuned for chat/assistant tasks \n- Enforces safety and ethical guidelines in responses \n- Quantized in GGUF format for CPU/GPU efficiency \n- Supports multiple inference clients (llama.cpp, KoboldCpp, etc.) \n\n**Comparison:** \nThis GGUF-quant"
22
+ }
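The Llama-2-7B-Chat-GGUF card describes downloading a single quantised file and running it with llama-cpp-python, but the command snippets were stripped. Below is a hedged sketch of that workflow using `huggingface_hub` and `llama-cpp-python`; the chosen quant file (Q4_K_M) and the generation settings are illustrative.

```python
# pip install huggingface_hub llama-cpp-python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download one quant file instead of cloning the whole repo
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
)

# Set n_gpu_layers > 0 to offload layers to the GPU if one is available
llm = Llama(model_path=model_path, n_ctx=4096, n_gpu_layers=0)

# Llama-2-Chat prompt template from the card
prompt = (
    "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n"
    "What is GGUF quantisation? [/INST]"
)
result = llm(prompt, max_tokens=256, temperature=0.7)
print(result["choices"][0]["text"])
```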
model_data_json/TheBloke_Mixtral-8x7B-Instruct-v0.1-AWQ.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
3
+ "downloads": 75286,
4
+ "tags": [
5
+ "transformers",
6
+ "safetensors",
7
+ "mixtral",
8
+ "text-generation",
9
+ "conversational",
10
+ "fr",
11
+ "it",
12
+ "de",
13
+ "es",
14
+ "en",
15
+ "base_model:mistralai/Mixtral-8x7B-Instruct-v0.1",
16
+ "base_model:quantized:mistralai/Mixtral-8x7B-Instruct-v0.1",
17
+ "license:apache-2.0",
18
+ "autotrain_compatible",
19
+ "text-generation-inference",
20
+ "4-bit",
21
+ "awq",
22
+ "region:us"
23
+ ],
24
+ "description": "--- base_model: mistralai/Mixtral-8x7B-Instruct-v0.1 inference: false language: - fr - it - de - es - en license: apache-2.0 model_creator: Mistral AI_ model_name: Mixtral 8X7B Instruct v0.1 model_type: mixtral prompt_template: '[INST] {prompt} [/INST] ' quantized_by: TheBloke widget: - output: text: 'Arr, shiver me timbers! Ye have a llama on yer lawn, ye say? Well, that be a new one for me! Here''s what I''d suggest, arr: 1. Firstly, ensure yer safety. Llamas may look gentle, but they can be protective if they feel threatened. 2. Try to make the area less appealing to the llama. Remove any food sources or water that might be attracting it. 3. Contact local animal control or a wildlife rescue organization. They be the experts and can provide humane ways to remove the llama from yer property. 4. If ye have any experience with animals, you could try to gently herd the llama towards a nearby field or open space. But be careful, arr! Remember, arr, it be important to treat the llama with respect and care. It be a creature just trying to survive, like the rest of us.' text: '[INST] You are a pirate chatbot who always responds with Arr and pirate speak! There''s a llama on my lawn, how can I get rid of him? [/INST]' --- <!-- markdownlint-disable MD041 --> <!-- header start --> <!-- 200823 --> <div style=\"width: auto; margin-left: auto; margin-right: auto\"> <img src=\" alt=\"TheBlokeAI\" style=\"width: 100%; min-width: 400px; display: block; margin: auto;\"> </div> <div style=\"display: flex; justify-content: space-between; width: 100%;\"> <div style=\"display: flex; flex-direction: column; align-items: flex-start;\"> <p style=\"margin-top: 0.5em; margin-bottom: 0em;\"><a href=\" & support: TheBloke's Discord server</a></p> </div> <div style=\"display: flex; flex-direction: column; align-items: flex-end;\"> <p style=\"margin-top: 0.5em; margin-bottom: 0em;\"><a href=\" to contribute? TheBloke's Patreon page</a></p> </div> </div> <div style=\"text-align:center; margin-top: 0em; margin-bottom: 0em\"><p style=\"margin-top: 0.25em; margin-bottom: 0em;\">TheBloke's LLM work is generously supported by a grant from <a href=\" horowitz (a16z)</a></p></div> <hr style=\"margin-top: 1.0em; margin-bottom: 1.0em;\"> <!-- header end --> # Mixtral 8X7B Instruct v0.1 - AWQ - Model creator: Mistral AI_ - Original model: Mixtral 8X7B Instruct v0.1 <!-- description start --> ## Description This repo contains AWQ model files for Mistral AI_'s Mixtral 8X7B Instruct v0.1. ### About AWQ AWQ is an efficient, accurate and blazing-fast low-bit weight quantization method, currently supporting 4-bit quantization. Compared to GPTQ, it offers faster Transformers-based inference with equivalent or better quality compared to the most commonly used GPTQ settings. AWQ models are currently supported on Linux and Windows, with NVidia GPUs only. macOS users: please use GGUF models instead. It is supported by: - Text Generation Webui - using Loader: AutoAWQ - vLLM - version 0.2.2 or later for support for all model types. - Hugging Face Text Generation Inference (TGI) - Transformers version 4.35.0 and later, from any code or client that supports Transformers - AutoAWQ - for use from Python code <!-- description end --> <!-- repositories-available start --> ## Repositories available * AWQ model(s) for GPU inference. * GPTQ models for GPU inference, with multiple quantisation parameter options. 
* 2, 3, 4, 5, 6 and 8-bit GGUF models for CPU+GPU inference * Mistral AI_'s original unquantised fp16 model in pytorch format, for GPU inference and for further conversions <!-- repositories-available end --> <!-- prompt-template start --> ## Prompt template: Mistral <!-- prompt-template end --> <!-- README_AWQ.md-provided-files start --> ## Provided files, and AWQ parameters I currently release 128g GEMM models only. The addition of group_size 32 models, and GEMV kernel models, is being actively considered. Models are released as sharded safetensors files. | Branch | Bits | GS | AWQ Dataset | Seq Len | Size | | ------ | ---- | -- | ----------- | ------- | ---- | | main | 4 | 128 | VMware Open Instruct | 8192 | 24.65 GB <!-- README_AWQ.md-provided-files end --> <!-- README_AWQ.md-text-generation-webui start --> ## How to easily download and use this model in text-generation-webui Please make sure you're using the latest version of text-generation-webui. It is strongly recommended to use the text-generation-webui one-click-installers unless you're sure you know how to make a manual install. 1. Click the **Model tab**. 2. Under **Download custom model or LoRA**, enter . 3. Click **Download**. 4. The model will start downloading. Once it's finished it will say \"Done\". 5. In the top left, click the refresh icon next to **Model**. 6. In the **Model** dropdown, choose the model you just downloaded: 7. Select **Loader: AutoAWQ**. 8. Click Load, and the model will load and is now ready for use. 9. If you want any custom settings, set them and then click **Save settings for this model** followed by **Reload the Model** in the top right. 10. Once you're ready, click the **Text Generation** tab and enter a prompt to get started! <!-- README_AWQ.md-text-generation-webui end --> <!-- README_AWQ.md-use-from-vllm start --> ## Multi-user inference server: vLLM Documentation on installing and using vLLM can be found here. - Please ensure you are using vLLM version 0.2 or later. - When using vLLM as a server, pass the parameter. For example: - When using vLLM from Python code, again set . For example: <!-- README_AWQ.md-use-from-vllm start --> <!-- README_AWQ.md-use-from-tgi start --> ## Multi-user inference server: Hugging Face Text Generation Inference (TGI) Use TGI version 1.1.0 or later. The official Docker container is: Example Docker parameters: Example Python code for interfacing with TGI (requires huggingface-hub 0.17.0 or later): <!-- README_AWQ.md-use-from-tgi end --> <!-- README_AWQ.md-use-from-python start --> ## Inference from Python code using Transformers ### Install the necessary packages - Requires: Transformers 4.35.0 or later. - Requires: AutoAWQ 0.1.6 or later. Note that if you are using PyTorch 2.0.1, the above AutoAWQ command will automatically upgrade you to PyTorch 2.1.0. If you are using CUDA 11.8 and wish to continue using PyTorch 2.0.1, instead run this command: If you have problems installing AutoAWQ using the pre-built wheels, install it from source instead: ### Transformers example code (requires Transformers 4.35.0 and later) <!-- README_AWQ.md-use-from-python end --> <!-- README_AWQ.md-compatibility start --> ## Compatibility The files provided are tested to work with: - text-generation-webui using . - vLLM version 0.2.0 and later. - Hugging Face Text Generation Inference (TGI) version 1.1.0 and later. - Transformers version 4.35.0 and later. - AutoAWQ version 0.1.1 and later. 
<!-- README_AWQ.md-compatibility end --> <!-- footer start --> <!-- 200823 --> ## Discord For further support, and discussions on these models and AI in general, join us at: TheBloke AI's Discord server ## Thanks, and how to contribute Thanks to the chirper.ai team! Thanks to Clay from gpus.llm-utils.org! I've had a lot of people ask if they can contribute. I enjoy providing models and helping people, and would love to be able to spend even more time doing it, as well as expanding into new projects like fine tuning/training. If you're able and willing to contribute it will be most gratefully received and will help me to keep providing more models, and to start work on new AI projects. Donaters will get priority support on any and all AI/LLM/model questions and requests, access to a private Discord room, plus other benefits. * Patreon: * Ko-Fi: **Special thanks to**: Aemon Algiz. **Patreon special mentions**: Michael Levine, 阿明, Trailburnt, Nikolai Manek, John Detwiler, Randy H, Will Dee, Sebastain Graf, NimbleBox.ai, Eugene Pentland, Emad Mostaque, Ai Maven, Jim Angel, Jeff Scroggin, Michael Davis, Manuel Alberto Morcote, Stephen Murray, Robert, Justin Joy, Luke @flexchar, Brandon Frisco, Elijah Stavena, S_X, Dan Guido, Undi ., Komninos Chatzipapas, Shadi, theTransient, Lone Striker, Raven Klaugh, jjj, Cap'n Zoog, Michel-Marie MAUDET (LINAGORA), Matthew Berman, David, Fen Risland, Omer Bin Jawed, Luke Pendergrass, Kalila, OG, Erik Bjäreholt, Rooh Singh, Joseph William Delisle, Dan Lewis, TL, John Villwock, AzureBlack, Brad, Pedro Madruga, Caitlyn Gatomon, K, jinyuan sun, Mano Prime, Alex, Jeffrey Morgan, Alicia Loh, Illia Dulskyi, Chadd, transmissions 11, fincy, Rainer Wilmers, ReadyPlayerEmma, knownsqashed, Mandus, biorpg, Deo Leter, Brandon Phillips, SuperWojo, Sean Connelly, Iucharbius, Jack West, Harry Royden McLaughlin, Nicholas, terasurfer, Vitor Caleffi, Duane Dunston, Johann-Peter Hartmann, David Ziegler, Olakabola, Ken Nordquist, Trenton Dambrowitz, Tom X Nguyen, Vadim, Ajan Kanaga, Leonard Tan, Clay Pascal, Alexandros Triantafyllidis, JM33133, Xule, vamX, ya boyyy, subjectnull, Talal Aujan, Alps Aficionado, wassieverse, Ari Malik, James Bentley, Woland, Spencer Kim, Michael Dempsey, Fred von Graf, Elle, zynix, William Richards, Stanislav Ovsiannikov, Edmond Seymore, Jonathan Leane, Martin Kemka, usrbinkat, Enrico Ros Thank you to all my generous patrons and donaters! And thank you again to a16z for their generous grant. <!-- footer end --> # Original model card: Mistral AI_'s Mixtral 8X7B Instruct v0.1 # Model Card for Mixtral-8x7B The Mixtral-8x7B Large Language Model (LLM) is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks we tested. For full details of this model please read our release blog post. ## Warning This repo contains weights that are compatible with vLLM serving of the model as well as Hugging Face transformers library. It is based on the original Mixtral torrent release, but the file format and parameter names are different. Please note that model cannot (yet) be instantiated with HF. ## Instruction format This format must be strictly respected, otherwise the model will generate sub-optimal outputs. The template used to build a prompt for the Instruct model is defined as follows: Note that and are special tokens for beginning of string (BOS) and end of string (EOS) while [INST] and [/INST] are regular strings. 
As reference, here is the pseudo-code used to tokenize instructions during fine-tuning: In the pseudo-code above, note that the method should not add a BOS or EOS token automatically, but should add a prefix space. ## Run the model By default, transformers will load the model in full precision. Therefore you might be interested to further reduce down the memory requirements to run the model through the optimizations we offer in HF ecosystem: ### In half-precision Note precision only works on GPU devices <details> <summary> Click to expand </summary> </details> ### Lower precision using (8-bit & 4-bit) using <details> <summary> Click to expand </summary> </details> ### Load the model with Flash Attention 2 <details> <summary> Click to expand </summary> </details> ## Limitations The Mixtral-8x7B Instruct model is a quick demonstration that the base model can be easily fine-tuned to achieve compelling performance. It does not have any moderation mechanisms. We're looking forward to engaging with the community on ways to make the model finely respect guardrails, allowing for deployment in environments requiring moderated outputs. # The Mistral AI Team Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Lélio Renard Lavaud, Louis Ternon, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Théophile Gervet, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.",
25
+ "model_explanation_gemini": "A quantized version of Mistral AI's Mixtral-8x7B-Instruct-v0.1 model optimized for efficient inference using AWQ (4-bit quantization), supporting multilingual text generation with a pirate-themed example demonstrating its conversational capabilities. \n\n**Features:** \n- Base model: `mistralai/Mixtral-8x7B-Instruct-v0.1` \n- Quantization: 4-bit AWQ for faster inference with comparable accuracy to GPTQ \n-"
26
+ }
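The Mixtral AWQ card walks through serving the files with vLLM by passing the quantization parameter, but the inline examples were stripped. Below is a minimal Python sketch along those lines; the prompt and sampling settings are illustrative, and the GPU still needs enough VRAM for the roughly 24.65 GB checkpoint.

```python
# pip install vllm  (0.2.2+ for Mixtral/AWQ support; NVIDIA GPU required)
from vllm import LLM, SamplingParams

llm = LLM(
    model="TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
    quantization="awq",
    dtype="auto",
)

# Mistral instruction format from the card
prompts = ["[INST] Explain what a sparse mixture-of-experts model is. [/INST]"]
params = SamplingParams(temperature=0.7, max_tokens=256)

for output in llm.generate(prompts, params):
    print(output.outputs[0].text)
```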
model_data_json/UBC-NLP_MARBERT.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "UBC-NLP/MARBERT",
3
+ "downloads": 80475,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "tf",
8
+ "jax",
9
+ "bert",
10
+ "fill-mask",
11
+ "Arabic BERT",
12
+ "MSA",
13
+ "Twitter",
14
+ "Masked Langauge Model",
15
+ "ar",
16
+ "autotrain_compatible",
17
+ "endpoints_compatible",
18
+ "region:us"
19
+ ],
20
+ "description": "--- language: - ar tags: - Arabic BERT - MSA - Twitter - Masked Langauge Model widget: - text: \"اللغة العربية هي لغة [MASK].\" --- <img src=\" alt=\"drawing\" width=\"200\" height=\"200\" align=\"right\"/> **MARBERT** is one of three models described in our **ACL 2021 paper** **\"ARBERT & MARBERT: Deep Bidirectional Transformers for Arabic\"**. MARBERT is a large-scale pre-trained masked language model focused on both Dialectal Arabic (DA) and MSA. Arabic has multiple varieties. To train MARBERT, we randomly sample 1B Arabic tweets from a large in-house dataset of about 6B tweets. We only include tweets with at least 3 Arabic words, based on character string matching, regardless whether the tweet has non-Arabic string or not. That is, we do not remove non-Arabic so long as the tweet meets the 3 Arabic word criterion. The dataset makes up **128GB of text** (**15.6B tokens**). We use the same network architecture as ARBERT (BERT-base), but without the next sentence prediction (NSP) objective since tweets are short. See our repo for modifying BERT code to remove NSP. For more information about MARBERT, please visit our own GitHub repo. # BibTex If you use our models (ARBERT, MARBERT, or MARBERTv2) for your scientific publication, or if you find the resources in this repository useful, please cite our paper as follows (to be updated): ## Acknowledgments We gratefully acknowledge support from the Natural Sciences and Engineering Research Council of Canada, the Social Sciences and Humanities Research Council of Canada, Canadian Foundation for Innovation, ComputeCanada and UBC ARC-Sockeye. We also thank the Google TensorFlow Research Cloud (TFRC) program for providing us with free TPU access.",
21
+ "model_explanation_gemini": "\"MARBERT is a large-scale Arabic BERT model pre-trained on 1B Arabic tweets for both Dialectal Arabic and Modern Standard Arabic, optimized for masked language modeling without next sentence prediction.\"\n\nModel Features: \n- Language: Arabic (Dialectal and Modern Standard) \n- Training Data: 1B Arabic tweets (128GB, 15.6B tokens) \n- Architecture: BERT-base (no NSP objective) \n- Use Case: Masked language modeling"
22
+ }
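Since MARBERT is a masked language model, the widget example from its card maps directly onto the transformers fill-mask pipeline. A short sketch; nothing here is specific to the repo beyond the model id.

```python
# pip install transformers torch
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="UBC-NLP/MARBERT")

# Widget example from the card: "The Arabic language is a [MASK] language."
for prediction in fill_mask("اللغة العربية هي لغة [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 3))
```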
model_data_json/ZhenYe234_hubert_base_general_audio.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "model_id": "ZhenYe234/hubert_base_general_audio",
3
+ "downloads": 68883,
4
+ "tags": [
5
+ "safetensors",
6
+ "hubert",
7
+ "arxiv:2408.17175",
8
+ "license:apache-2.0",
9
+ "region:us"
10
+ ],
11
+ "description": "--- license: apache-2.0 --- ![arXiv]( A hubert model trained on 200k hours general audio.",
12
+ "model_explanation_gemini": "A Hubert model trained on 200k hours of general audio for broad audio processing tasks. \n\n**Features:** \n- Pre-trained on 200k hours of general audio data. \n- Based on the Hubert architecture. \n\n**Comparison:** (Not applicable—no comparative details provided in the description.)"
13
+ }
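The HuBERT card gives no usage snippet. Below is a speculative feature-extraction sketch, assuming the checkpoint is compatible with transformers' `HubertModel`; the zero-valued waveform is a placeholder for real 16 kHz audio.

```python
# pip install transformers torch
import torch
from transformers import HubertModel

# Assumption: the checkpoint follows the standard transformers HuBERT layout
model = HubertModel.from_pretrained("ZhenYe234/hubert_base_general_audio")
model.eval()

# Placeholder: one second of silent 16 kHz audio, shape (batch, samples)
waveform = torch.zeros(1, 16000)

with torch.no_grad():
    features = model(input_values=waveform).last_hidden_state  # (batch, frames, hidden)
print(features.shape)
```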
model_data_json/aglazkova_bart_finetuned_keyphrase_extraction.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "model_id": "aglazkova/bart_finetuned_keyphrase_extraction",
3
+ "downloads": 83042,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "bart",
8
+ "text2text-generation",
9
+ "en",
10
+ "dataset:midas/krapivin",
11
+ "dataset:midas/inspec",
12
+ "dataset:midas/kptimes",
13
+ "dataset:midas/duc2001",
14
+ "arxiv:1910.13461",
15
+ "arxiv:2312.10700",
16
+ "autotrain_compatible",
17
+ "endpoints_compatible",
18
+ "region:us"
19
+ ],
20
+ "description": "--- datasets: - midas/krapivin - midas/inspec - midas/kptimes - midas/duc2001 language: - en widget: - text: \"Relevance has traditionally been linked with feature subset selection, but formalization of this link has not been attempted. In this paper, we propose two axioms for feature subset selection sufficiency axiom and necessity axiombased on which this link is formalized: The expected feature subset is the one which maximizes relevance. Finding the expected feature subset turns out to be NP-hard. We then devise a heuristic algorithm to find the expected subset which has a polynomial time complexity. The experimental results show that the algorithm finds good enough subset of features which, when presented to C4.5, results in better prediction accuracy.\" - text: \"In this paper, we investigate cross-domain limitations of keyphrase generation using the models for abstractive text summarization. We present an evaluation of BART fine-tuned for keyphrase generation across three types of texts, namely scientific texts from computer science and biomedical domains and news texts. We explore the role of transfer learning between different domains to improve the model performance on small text corpora.\" --- # BART fine-tuned for keyphrase generation <!-- Provide a quick summary of what the model is/does. --> This is the <a href=\" (<a href = \" et al.. 2019</a>) model finetuned for the keyphrase generation task (<a href=\" & Morozov, 2023</a>) on the fragments of the following corpora: * Krapivin (<a href = \" et al., 2009</a>) * Inspec (<a href = \" 2003</a>) * KPTimes (<a href = \" 2019</a>) * DUC-2001 (<a href = \" 2008</a>) * PubMed (<a href = \" 2008</a>) * NamedKeys (<a href = \" & Ho, 2019</a>). #### Training Hyperparameters The following hyperparameters were used during training: * learning_rate: 4e-5 * train_batch_size: 8 * optimizer: AdamW with betas=(0.9,0.999) and epsilon=1e-08 * num_epochs: 6 **BibTeX:**",
21
+ "model_explanation_gemini": "\"Finetuned BART model for generating keyphrases from scientific and news texts, trained on multiple datasets including Krapivin, Inspec, and KPTimes.\"\n\nModel Features: \n- Task: Keyphrase generation \n- Base Model: BART \n- Training Datasets: Krapivin, Inspec, KPTimes, DUC-2001, PubMed, NamedKeys \n- Supported Language: English \n- Training Hyperparameters: \n - Learning rate: 4e"
22
+ }
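The keyphrase-extraction card lists text2text-generation as the task. A minimal sketch using the transformers pipeline, reusing one of the card's own widget abstracts as input; the generation length is an illustrative choice.

```python
# pip install transformers torch
from transformers import pipeline

generator = pipeline(
    "text2text-generation",
    model="aglazkova/bart_finetuned_keyphrase_extraction",
)

abstract = (
    "In this paper, we investigate cross-domain limitations of keyphrase generation "
    "using the models for abstractive text summarization."
)
print(generator(abstract, max_new_tokens=32)[0]["generated_text"])
```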
model_data_json/aipicasso_emi.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "model_id": "aipicasso/emi",
3
+ "downloads": 274385,
4
+ "tags": [
5
+ "diffusers",
6
+ "safetensors",
7
+ "stable-diffusion",
8
+ "text-to-image",
9
+ "arxiv:2307.01952",
10
+ "arxiv:2212.03860",
11
+ "license:openrail++",
12
+ "autotrain_compatible",
13
+ "diffusers:StableDiffusionXLPipeline",
14
+ "region:us"
15
+ ],
16
+ "description": "--- extra_gated_prompt: このモデルをこのページからダウンロードするためにはHugging Faceに登録された情報を提供する必要があります。この提供された情報は画像生成AIを活用する情報を案内するために使われます。 To download this model from this page, you need to provide information registered with Hugging Face. The information provided will be used to guide you on how to utilize the image-generation AI. license: openrail++ tags: - stable-diffusion - text-to-image inference: false library_name: diffusers --- # Emi Model Card !eyecatch.jpg Original(PNG) English: Click Here # はじめに Emi (Ethereal master of illustration) は、 最先端の開発機材H100と画像生成Stable Diffusion XL 1.0を用いて AI Picasso社が開発したAIアートに特化した画像生成AIです。 このモデルの特徴として、Danbooruなどにある無断転載画像を学習していないことがあげられます。 # ライセンスについて ライセンスについては、これまでとは違い、 CreativeML Open RAIL++-M License です。 したがって、**商用利用可能**です。 これは次のように判断したためです。 - 画像生成AIが普及するに伴い、創作業界に悪影響を及ぼさないように、マナーを守る人が増えてきたため - 他の画像生成AIが商用可能である以上、あまり非商用ライセンスである実効性がなくなってきたため # 使い方 ここからデモを利用することができます。 本格的に利用する人はここからモデルをダウンロードできます。 通常版で生成がうまく行かない場合は、安定版をお使いください。 # シンプルな作品例 !example_1.jpg !example_2.png !example_3.jpg # モデルの出力向上について - 確実にアニメ調のイラストを出したいときは、anime artwork, anime styleとプロンプトの先頭に入れてください。 - プロンプトにtransparentという言葉を入れると、より最近の画風になります。 - 全身 (full body) を描くとうまく行かない場合もあるため、そのときは安定版をお試しください。 - 使えるプロンプトはWaifu Diffusionと同じです。また、Stable Diffusionのように使うこともできます。 - ネガティブプロンプトにTextual Inversionを使用することをおすすめします。 - 手が不安定なため、DreamShaper XL1.0などの実写系モデルとのマージをおすすめします。 - ChatGPTを用いてプロンプトを洗練すると、自分の枠を超えた作品に出会えます。 - 最新のComfyUIにあるFreeUノード、またはWeb UIの拡張機能を次のパラメータで使うとさらに出力が上がる可能性があります。次の画像はFreeUを使った例です。 - b1 = 1.1, b2 = 1.2, s1 = 0.6, s2 = 0.4 report !example_4.png # 法律について 本モデルは日本にて作成されました。したがって、日本の法律が適用されます。 本モデルの学習は、著作権法第30条の4に基づき、合法であると主張します。 また、本モデルの配布については、著作権法や刑法175条に照らしてみても、 正犯や幇助犯にも該当しないと主張します。詳しくは柿沼弁護士の見解を御覧ください。 ただし、ライセンスにもある通り、本モデルの生成物は各種法令に従って取り扱って下さい。 # 連絡先 [email protected] 以下、一般的なモデルカードの日本語訳です。 ## モデル詳細 - **モデルタイプ:** 拡散モデルベースの text-to-image 生成モデル - **言語:** 日本語 - **ライセンス:** CreativeML Open RAIL++-M License - **モデルの説明:** このモデルはプロンプトに応じて適切な画像を生成することができます。アルゴリズムは Latent Diffusion Model と OpenCLIP-ViT/G、CLIP-L です。 - **補足:** - **参考文献:** ## モデルの使用例 Stable Diffusion XL 1.0と同じ使い方です。 たくさんの方法がありますが、3つのパターンを提供します。 - ComfyUI - Fooocus - Diffusers ### ComfyUIやFooocusの場合 Stable Diffusion XL 1.0 の使い方と同じく、safetensor形式のモデルファイルを使ってください。 詳しいインストール方法は、こちらの記事を参照してください。 ### Diffusersの場合 🤗's Diffusers library を使ってください。 まずは、以下のスクリプトを実行し、ライブラリをいれてください。 次のスクリ��トを実行し、画像を生成してください。 複雑な操作はデモのソースコードを参考にしてください。 #### 想定される用途 - イラストや漫画、アニメの作画補助 - 商用・非商用は問わない - 依頼の際のクリエイターとのコミュニケーション - 画像生成サービスの商用提供 - 生成物の取り扱いには注意して使ってください。 - 自己表現 - このAIを使い、「あなた」らしさを発信すること - 研究開発 - Discord上でのモデルの利用 - プロンプトエンジニアリング - ファインチューニング(追加学習とも) - DreamBooth など - 他のモデルとのマージ - 本モデルの性能をFIDなどで調べること - 本モデルがStable Diffusion以外のモデルとは独立であることをチェックサムやハッシュ関数などで調べること - 教育 - 美大生や専門学校生の卒業制作 - 大学生の卒業論文や課題制作 - 先生が画像生成AIの現状を伝えること - Hugging Face の Community にかいてある用途 - 日本語か英語で質問してください #### 想定されない用途 - 物事を事実として表現するようなこと - 先生を困らせるようなこと - その他、創作業界に悪影響を及ぼすこと # 使用してはいけない用途や悪意のある用途 - マネー・ロンダリングに用いないでください - デジタル贋作 (Digital Forgery) は公開しないでください(著作権法に違反するおそれ) - 他人の作品を無断でImage-to-Imageしないでください(著作権法に違反するおそれ) - わいせつ物を頒布しないでください (刑法175条に違反するおそれ) - いわゆる業界のマナーを守らないようなこと - 事実に基づかないことを事実のように語らないようにしてください(威力業務妨害罪が適用されるおそれ) - フェイクニュース ## モデルの限界やバイアス ### モデルの限界 - 拡散モデルや大規模言語モデルは、いまだに未知の部分が多く、その限界は判明していない。 ### バイアス - 拡散モデルや大規模言語モデルは、いまだに未知の部分が多く、バイアスは判明していない。 ## 学習 **学習データ** - Stable Diffusionと同様のデータセットからDanbooruの無断転載画像を取り除いて手動で集めた約2000枚の画像 - Stable Diffusionと同様のデータセットからDanbooruの無断転載画像を取り除いて自動で集めた約50万枚の画像 **学習プロセス** - **ハードウェア:** H100 ## 評価結果 第三者による評価を求めています。 ## 環境への影響 - **ハードウェアタイプ:** H100 - 
**使用時間(単位は時間):** 500 - **学習した場所:** 日本 ## 参考文献",
17
+ "model_explanation_gemini": "Generates anime-style illustrations using Stable Diffusion XL 1.0, avoiding unauthorized Danbooru images, and allows commercial use under the Open RAIL++-M license.\n\n**Features:** \n- Text-to-image diffusion model (Stable Diffusion XL-based) \n- Specialized for anime artwork generation \n- Commercial use permitted \n- Optimized prompts (e.g., \"anime style\", \"transparent\") \n- Compatible with Waifu Diffusion/Stable Diffusion prompts \n-"
18
+ }
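The Emi card names Diffusers as one supported way to run the model. Below is a hedged sketch using `StableDiffusionXLPipeline`, assuming you have accepted the gated-access terms on the Hub and have a CUDA GPU; the prompt and step count are illustrative and follow the card's advice to lead with "anime artwork, anime style".

```python
# pip install diffusers transformers accelerate safetensors
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "aipicasso/emi",
    torch_dtype=torch.float16,
).to("cuda")

# The card suggests "anime artwork, anime style" as a prefix and "transparent" for a newer look
prompt = "anime artwork, anime style, a girl reading under cherry blossoms, transparent"
image = pipe(prompt, num_inference_steps=30).images[0]
image.save("emi_sample.png")
```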
model_data_json/allegro_herbert-base-cased.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "model_id": "allegro/herbert-base-cased",
3
+ "downloads": 78442,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "tf",
8
+ "jax",
9
+ "bert",
10
+ "feature-extraction",
11
+ "herbert",
12
+ "pl",
13
+ "license:cc-by-4.0",
14
+ "endpoints_compatible",
15
+ "region:us"
16
+ ],
17
+ "description": "--- language: pl tags: - herbert license: cc-by-4.0 --- # HerBERT **HerBERT** is a BERT-based Language Model trained on Polish corpora using Masked Language Modelling (MLM) and Sentence Structural Objective (SSO) with dynamic masking of whole words. For more details, please refer to: HerBERT: Efficiently Pretrained Transformer-based Language Model for Polish. Model training and experiments were conducted with transformers in version 2.9. ## Corpus HerBERT was trained on six different corpora available for Polish language: | Corpus | Tokens | Documents | | :------ | ------: | ------: | | CCNet Middle | 3243M | 7.9M | | CCNet Head | 2641M | 7.0M | | National Corpus of Polish| 1357M | 3.9M | | Open Subtitles | 1056M | 1.1M | Wikipedia | 260M | 1.4M | | Wolne Lektury | 41M | 5.5k | ## Tokenizer The training dataset was tokenized into subwords using a character level byte-pair encoding (``. ## Usage Example code: ## License CC BY 4.0 ## Citation If you use this model, please cite the following paper: ## Authors The model was trained by **Machine Learning Research Team at Allegro** and **Linguistic Engineering Group at Institute of Computer Science, Polish Academy of Sciences**. You can contact us at: <a href=\"mailto:[email protected]\">[email protected]</a>",
18
+ "model_explanation_gemini": "\"HerBERT is a BERT-based Polish language model trained with MLM and SSO objectives on diverse Polish corpora for natural language processing tasks.\"\n\nModel Features: \n- BERT-based architecture \n- Trained on Polish corpora (CCNet, National Corpus of Polish, Open Subtitles, Wikipedia, Wolne Lektury) \n- Uses Masked Language Modeling (MLM) and Sentence Structural Objective (SSO) \n- Implements dynamic whole-word masking \n- Byte-pair"
19
+ }
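The HerBERT card's "Example code" block was stripped. Below is a minimal feature-extraction sketch with the standard transformers Auto classes; the Polish sentence and the use of the [CLS] vector as a sentence representation are illustrative choices.

```python
# pip install transformers torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = AutoModel.from_pretrained("allegro/herbert-base-cased")

# "Machine learning is changing Polish language processing."
inputs = tokenizer(
    "Uczenie maszynowe zmienia przetwarzanie języka polskiego.",
    return_tensors="pt",
)
outputs = model(**inputs)

# Take the [CLS] vector as a simple sentence representation
sentence_embedding = outputs.last_hidden_state[:, 0, :]
print(sentence_embedding.shape)  # (1, 768)
```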
model_data_json/answerdotai_ModernBERT-large.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "answerdotai/ModernBERT-large",
3
+ "downloads": 80863,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "onnx",
8
+ "safetensors",
9
+ "modernbert",
10
+ "fill-mask",
11
+ "masked-lm",
12
+ "long-context",
13
+ "en",
14
+ "arxiv:2412.13663",
15
+ "license:apache-2.0",
16
+ "autotrain_compatible",
17
+ "region:us"
18
+ ],
19
+ "description": "--- library_name: transformers license: apache-2.0 language: - en tags: - fill-mask - masked-lm - long-context - modernbert pipeline_tag: fill-mask inference: false --- # ModernBERT ## Table of Contents 1. Model Summary 2. Usage 3. Evaluation 4. Limitations 5. Training 6. License 7. Citation ## Model Summary ModernBERT is a modernized bidirectional encoder-only Transformer model (BERT-style) pre-trained on 2 trillion tokens of English and code data with a native context length of up to 8,192 tokens. ModernBERT leverages recent architectural improvements such as: - **Rotary Positional Embeddings (RoPE)** for long-context support. - **Local-Global Alternating Attention** for efficiency on long inputs. - **Unpadding and Flash Attention** for efficient inference. ModernBERT’s native long context length makes it ideal for tasks that require processing long documents, such as retrieval, classification, and semantic search within large corpora. The model was trained on a large corpus of text and code, making it suitable for a wide range of downstream tasks, including code retrieval and hybrid (text + code) semantic search. It is available in the following sizes: - ModernBERT-base - 22 layers, 149 million parameters - ModernBERT-large - 28 layers, 395 million parameters For more information about ModernBERT, we recommend our release blog post for a high-level overview, and our arXiv pre-print for in-depth information. *ModernBERT is a collaboration between Answer.AI, LightOn, and friends.* ## Usage You can use these models directly with the library starting from v4.48.0: Since ModernBERT is a Masked Language Model (MLM), you can use the pipeline or load it via . To use ModernBERT for downstream tasks like classification, retrieval, or QA, fine-tune it following standard BERT fine-tuning recipes. **⚠️ If your GPU supports it, we recommend using ModernBERT with Flash Attention 2 to reach the highest efficiency. To do so, install Flash Attention as follows, then use the model as normal:** Using : Using a pipeline: **Note:** ModernBERT does not use token type IDs, unlike some earlier BERT models. Most downstream usage is identical to standard BERT models on the Hugging Face Hub, except you can omit the parameter. ## Evaluation We evaluate ModernBERT across a range of tasks, including natural language understanding (GLUE), general retrieval (BEIR), long-context retrieval (MLDR), and code retrieval (CodeSearchNet and StackQA). **Key highlights:** - On GLUE, ModernBERT-base surpasses other similarly-sized encoder models, and ModernBERT-large is second only to Deberta-v3-large. - For general retrieval tasks, ModernBERT performs well on BEIR in both single-vector (DPR-style) and multi-vector (ColBERT-style) settings. - Thanks to the inclusion of code data in its training mixture, ModernBERT as a backbone also achieves new state-of-the-art code retrieval results on CodeSearchNet and StackQA. 
### Base Models | Model | IR (DPR) | IR (DPR) | IR (DPR) | IR (ColBERT) | IR (ColBERT) | NLU | Code | Code | |-------------|--------------|--------------|--------------|---------------|---------------|------|------|------| | | BEIR | MLDR_OOD | MLDR_ID | BEIR | MLDR_OOD | GLUE | CSN | SQA | | BERT | 38.9 | 23.9 | 32.2 | 49.0 | 28.1 | 84.7 | 41.2 | 59.5 | | RoBERTa | 37.7 | 22.9 | 32.8 | 48.7 | 28.2 | 86.4 | 44.3 | 59.6 | | DeBERTaV3 | 20.2 | 5.4 | 13.4 | 47.1 | 21.9 | 88.1 | 17.5 | 18.6 | | NomicBERT | 41.0 | 26.7 | 30.3 | 49.9 | 61.3 | 84.0 | 41.6 | 61.4 | | GTE-en-MLM | 41.4 | **34.3** |**44.4** | 48.2 | 69.3 | 85.6 | 44.9 | 71.4 | | ModernBERT | **41.6** | 27.4 | 44.0 | **51.3** | **80.2** | **88.4** | **56.4** |**73.6**| --- ### Large Models | Model | IR (DPR) | IR (DPR) | IR (DPR) | IR (ColBERT) | IR (ColBERT) | NLU | Code | Code | |-------------|--------------|--------------|--------------|---------------|---------------|------|------|------| | | BEIR | MLDR_OOD | MLDR_ID | BEIR | MLDR_OOD | GLUE | CSN | SQA | | BERT | 38.9 | 23.3 | 31.7 | 49.5 | 28.5 | 85.2 | 41.6 | 60.8 | | RoBERTa | 41.4 | 22.6 | 36.1 | 49.8 | 28.8 | 88.9 | 47.3 | 68.1 | | DeBERTaV3 | 25.6 | 7.1 | 19.2 | 46.7 | 23.0 | **91.4**| 21.2 | 19.7 | | GTE-en-MLM | 42.5 | **36.4** | **48.9** | 50.7 | 71.3 | 87.6 | 40.5 | 66.9 | | ModernBERT | **44.0** | 34.3 | 48.6 | **52.4** | **80.4** | 90.4 |**59.5** |**83.9**| *Table 1: Results for all models across an overview of all tasks. CSN refers to CodeSearchNet and SQA to StackQA. MLDRID refers to in-domain (fine-tuned on the training set) evaluation, and MLDR_OOD to out-of-domain.* ModernBERT’s strong results, coupled with its efficient runtime on long-context inputs, demonstrate that encoder-only models can be significantly improved through modern architectural choices and extensive pretraining on diversified data sources. ## Limitations ModernBERT’s training data is primarily English and code, so performance may be lower for other languages. While it can handle long sequences efficiently, using the full 8,192 tokens window may be slower than short-context inference. Like any large language model, ModernBERT may produce representations that reflect biases present in its training data. Verify critical or sensitive outputs before relying on them. ## Training - Architecture: Encoder-only, Pre-Norm Transformer with GeGLU activations. - Sequence Length: Pre-trained up to 1,024 tokens, then extended to 8,192 tokens. - Data: 2 trillion tokens of English text and code. - Optimizer: StableAdamW with trapezoidal LR scheduling and 1-sqrt decay. - Hardware: Trained on 8x H100 GPUs. See the paper for more details. ## License We release the ModernBERT model architectures, model weights, training codebase under the Apache 2.0 license. ## Citation If you use ModernBERT in your work, please cite:",
20
+ "model_explanation_gemini": "\"ModernBERT-large is a BERT-style model optimized for long-context tasks (up to 8,192 tokens) with architectural improvements like RoPE and Flash Attention, excelling in retrieval, NLU, and code-related tasks.\"\n\n**Features:** \n- **Architecture:** Rotary Positional Embeddings (RoPE), Local-Global Alternating Attention, Unpadding, Flash Attention support. \n- **Context Length:** Native 8,192-token handling. \n- **Training"
21
+ }
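The ModernBERT card notes that the model works with the fill-mask pipeline from transformers v4.48.0 onward. A short sketch of that usage, with an illustrative example sentence:

```python
# pip install "transformers>=4.48.0" torch
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="answerdotai/ModernBERT-large")

for prediction in fill_mask("The capital of France is [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 3))
```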
model_data_json/ashawkey_mvdream-sd2.1-diffusers.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "model_id": "ashawkey/mvdream-sd2.1-diffusers",
3
+ "downloads": 78372,
4
+ "tags": [
5
+ "diffusers",
6
+ "safetensors",
7
+ "arxiv:2308.16512",
8
+ "license:openrail",
9
+ "diffusers:MVDreamPipeline",
10
+ "region:us"
11
+ ],
12
+ "description": "--- license: openrail --- # MVDream-diffusers Model Card This is a port of into diffusers. For usage, please check: ## Citation ## Misuse, Malicious Use, and Out-of-Scope Use The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.",
13
+ "model_explanation_gemini": "Port of MVDream-diffusers for generating multi-view images with Stable Diffusion 2.1.\n\nFeatures: \n- Multi-view image generation \n- Based on Stable Diffusion 2.1 \n- Diffusers integration \n\nComparison: Not applicable (no comparative details provided in the description)."
14
+ }
model_data_json/aubmindlab_bert-base-arabert.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model_id": "aubmindlab/bert-base-arabert",
3
+ "downloads": 74124,
4
+ "tags": [
5
+ "transformers",
6
+ "pytorch",
7
+ "tf",
8
+ "jax",
9
+ "safetensors",
10
+ "bert",
11
+ "fill-mask",
12
+ "ar",
13
+ "arxiv:2003.00104",
14
+ "autotrain_compatible",
15
+ "endpoints_compatible",
16
+ "region:us"
17
+ ],
18
+ "description": "--- language: ar datasets: - wikipedia - Osian - 1.5B-Arabic-Corpus - oscar-arabic-unshuffled - Assafir(private) widget: - text: \" عاصم +ة لبنان هي [MASK] .\" --- # !!! A newer version of this model is available !!! AraBERTv2 # AraBERT v1 & v2 : Pre-training BERT for Arabic Language Understanding <img src=\" width=\"100\" align=\"left\"/> **AraBERT** is an Arabic pretrained lanaguage model based on Google's BERT architechture. AraBERT uses the same BERT-Base config. More details are available in the AraBERT Paper and in the AraBERT Meetup There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the Farasa Segmenter. We evalaute AraBERT models on different downstream tasks and compare them to mBERT), and other state of the art models (*To the extent of our knowledge*). The Tasks were Sentiment Analysis on 6 different datasets (HARD, ASTD-Balanced, ArsenTD-Lev, LABR), Named Entity Recognition with the ANERcorp, and Arabic Question Answering on Arabic-SQuAD and ARCD # AraBERTv2 ## What's New! AraBERT now comes in 4 new variants to replace the old v1 versions: More Detail in the AraBERT folder and in the README and in the AraBERT Paper Model | HuggingFace Model Name | Size (MB/Params)| Pre-Segmentation | DataSet (Sentences/Size/nWords) | ---|:---:|:---:|:---:|:---: AraBERTv0.2-base | bert-base-arabertv02 | 543MB / 136M | No | 200M / 77GB / 8.6B | AraBERTv0.2-large| bert-large-arabertv02 | 1.38G 371M | No | 200M / 77GB / 8.6B | AraBERTv2-base| bert-base-arabertv2 | 543MB 136M | Yes | 200M / 77GB / 8.6B | AraBERTv2-large| bert-large-arabertv2 | 1.38G 371M | Yes | 200M / 77GB / 8.6B | AraBERTv0.1-base| bert-base-arabertv01 | 543MB 136M | No | 77M / 23GB / 2.7B | AraBERTv1-base| bert-base-arabert | 543MB 136M | Yes | 77M / 23GB / 2.7B | All models are available in the model page under the aubmindlab name. Checkpoints are available in PyTorch, TF2 and TF1 formats. ## Better Pre-Processing and New Vocab We identified an issue with AraBERTv1's wordpiece vocabulary. The issue came from punctuations and numbers that were still attached to words when learned the wordpiece vocab. We now insert a space between numbers and characters and around punctuation characters. The new vocabulary was learnt using the from the library, and should now support the Fast tokenizer implementation from the library. **P.S.**: All the old BERT codes should work with the new BERT, just change the model name and check the new preprocessing dunction **Please read the section on how to use the preprocessing function** ## Bigger Dataset and More Compute We used ~3.5 times more data, and trained for longer. For Dataset Sources see the Dataset Section Model | Hardware | num of examples with seq len (128 / 512) |128 (Batch Size/ Num of Steps) | 512 (Batch Size/ Num of Steps) | Total Steps | Total Time (in Days) | ---|:---:|:---:|:---:|:---:|:---:|:---: AraBERTv0.2-base | TPUv3-8 | 420M / 207M |2560 / 1M | 384/ 2M | 3M | - AraBERTv0.2-large | TPUv3-128 | 420M / 207M | 13440 / 250K | 2056 / 300K | 550K | - AraBERTv2-base | TPUv3-8 | 520M / 245M |13440 / 250K | 2056 / 300K | 550K | - AraBERTv2-large | TPUv3-128 | 520M / 245M | 13440 / 250K | 2056 / 300K | 550K | - AraBERT-base (v1/v0.1) | TPUv2-8 | - |512 / 900K | 128 / 300K| 1.2M | 4 days # Dataset The pretraining data used for the new AraBERT model is also used for Arabic **GPT2 and ELECTRA**. 
The dataset consists of 77GB or 200,095,961 lines or 8,655,948,860 words or 82,232,988,358 chars (before applying Farasa Segmentation) For the new dataset we added the unshuffled OSCAR corpus, after we thoroughly filter it, to the previous dataset used in AraBERTv1 but with out the websites that we previously crawled: - OSCAR unshuffled and filtered. - Arabic Wikipedia dump from 2020/09/01 - The 1.5B words Arabic Corpus - The OSIAN Corpus - Assafir news articles. Huge thank you for Assafir for giving us the data # Preprocessing It is recommended to apply our preprocessing function before training/testing on any dataset. **Install farasapy to segment text for AraBERT v1 & v2 ** ## Accepted_models # TensorFlow 1.x models The TF1.x model are available in the HuggingFace models repo. You can download them as follows: - via git-lfs: clone all the models in a repo where is any model under the name - via : - Go to the tf1_model.tar.gz file on huggingface.co/models/aubmindlab/MODEL_NAME. - copy the - then run (ex: for : ) # If you used this model please cite us as : Google Scholar has our Bibtex wrong (missing name), use this instead # Acknowledgments Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs, couldn't have done it without this program, and to the AUB MIND Lab Members for the continous support. Also thanks to Yakshof and Assafir for data and storage access. Another thanks for Habib Rahal ( for putting a face to AraBERT. ## Contacts **Wissam Antoun**: Linkedin | Twitter | Github | <[email protected]> | <[email protected]> **Fady Baly**: Linkedin | Twitter | Github | <[email protected]> | <[email protected]>",
19
+ "model_explanation_gemini": "Pretrained BERT-based model for Arabic language understanding, optimized for tasks like sentiment analysis, named entity recognition, and question answering.\n\nFeatures: \n- Based on BERT-Base architecture \n- Trained on large Arabic datasets (Wikipedia, OSCAR, 1.5B-Arabic-Corpus, etc.) \n- Supports pre-segmented text (Farasa Segmenter in v1/v2) \n- Improved vocabulary handling (v2) \n- Available in base/large"
20
+ }
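The AraBERT entry above notes that v1/v2 expect Farasa-style pre-segmented input and ship a dedicated preprocessing helper, but the card's code blocks were stripped during extraction. The sketch below is a rough illustration only, not the card's original snippet: it assumes the `arabert` and `farasapy` packages are installed and that `ArabertPreprocessor` and its `preprocess()` method behave as in recent releases of the aubmindlab/arabert project; the masked sentence reuses the widget example, which is already segmented.

```python
# Hedged sketch: Farasa-style preprocessing plus masked-token prediction with AraBERT v1.
# Assumes `pip install transformers arabert farasapy`; class and method names are taken
# from the aubmindlab/arabert project and may differ between versions.
from arabert.preprocess import ArabertPreprocessor
from transformers import pipeline

model_name = "aubmindlab/bert-base-arabert"

# Pre-segmentation step required for AraBERT v1/v2 inputs.
prep = ArabertPreprocessor(model_name=model_name)
print(prep.preprocess("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"))

# Masked-token prediction on the already-segmented widget sentence from the card.
fill_mask = pipeline("fill-mask", model=model_name)
print(fill_mask("عاصم +ة لبنان هي [MASK] ."))
```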
model_data_json/autogluon_chronos-bolt-tiny.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "autogluon/chronos-bolt-tiny",
3
+ "downloads": 69867,
4
+ "tags": [
5
+ "safetensors",
6
+ "t5",
7
+ "time series",
8
+ "forecasting",
9
+ "pretrained models",
10
+ "foundation models",
11
+ "time series foundation models",
12
+ "time-series",
13
+ "time-series-forecasting",
14
+ "arxiv:1910.10683",
15
+ "arxiv:2403.07815",
16
+ "license:apache-2.0",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: apache-2.0 pipeline_tag: time-series-forecasting tags: - time series - forecasting - pretrained models - foundation models - time series foundation models - time-series --- # Chronos-Bolt⚡ (Tiny) 🚀 **Update Feb 14, 2025**: Chronos-Bolt models are now available on Amazon SageMaker JumpStart! Check out the tutorial notebook to learn how to deploy Chronos endpoints for production use in a few lines of code. Chronos-Bolt is a family of pretrained time series forecasting models which can be used for zero-shot forecasting. It is based on the T5 encoder-decoder architecture and has been trained on nearly 100 billion time series observations. It chunks the historical time series context into patches of multiple observations, which are then input into the encoder. The decoder then uses these representations to directly generate quantile forecasts across multiple future steps—a method known as direct multi-step forecasting. Chronos-Bolt models are **more accurate**, up to **250 times faster** and **20 times more memory-efficient** than the original Chronos models of the same size. ## Performance The following plot compares the inference time of Chronos-Bolt against the original Chronos models for forecasting 1024 time series with a context length of 512 observations and a prediction horizon of 64 steps. <center> <img src=\" width=\"50%\"/> </center> Chronos-Bolt models are not only significantly faster but also more accurate than the original Chronos models. The following plot reports the probabilistic and point forecasting performance of Chronos-Bolt in terms of the Weighted Quantile Loss (WQL) and the Mean Absolute Scaled Error (MASE), respectively, aggregated over 27 datasets (see the Chronos paper for details on this benchmark). Remarkably, despite having no prior exposure to these datasets during training, the zero-shot Chronos-Bolt models outperform commonly used statistical models and deep learning models that have been trained on these datasets (highlighted by *). Furthermore, they also perform better than other FMs, denoted by a +, which indicates that these models were pretrained on certain datasets in our benchmark and are not entirely zero-shot. Notably, Chronos-Bolt (Base) also surpasses the original Chronos (Large) model in terms of the forecasting accuracy while being over 600 times faster. <center> <img src=\" width=\"80%\"/> </center> Chronos-Bolt models are available in the following sizes. <div align=\"center\"> | Model | Parameters | Based on | | ---------------------------------------------------------------------- | ---------- | ---------------------------------------------------------------------- | | **chronos-bolt-tiny** | 9M | t5-efficient-tiny | | **chronos-bolt-mini** | 21M | t5-efficient-mini | | **chronos-bolt-small** | 48M | t5-efficient-small | | **chronos-bolt-base** | 205M | t5-efficient-base | </div> ## Usage ### Zero-shot inference with Chronos-Bolt in AutoGluon Install the required dependencies. Forecast with the Chronos-Bolt model. For more advanced features such as **fine-tuning** and **forecasting with covariates**, check out this tutorial. ### Deploying a Chronos-Bolt endpoint to SageMaker First, update the SageMaker SDK to make sure that all the latest models are available. Deploy an inference endpoint to SageMaker. Now you can send time series data to the endpoint in JSON format. Chronos-Bolt models can be deployed to both CPU and GPU instances. These models also support **forecasting with covariates**. 
For more details about the endpoint API, check out the example notebook. ## Citation If you find Chronos or Chronos-Bolt models useful for your research, please consider citing the associated paper: ## License This project is licensed under the Apache-2.0 License.",
20
+ "model_explanation_gemini": "\"Pretrained time series forecasting model based on T5 architecture, offering zero-shot forecasting with improved accuracy, speed, and memory efficiency compared to original Chronos models.\"\n\nModel Features: \n- Zero-shot time series forecasting capability \n- T5 encoder-decoder architecture \n- Processes historical data in patched observations \n- Direct multi-step quantile forecasting \n- Pretrained on ~100B time series observations \n- Available in tiny/mini/small/base sizes \n\nComparison: \nChronos-Bolt"
21
+ }
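The Chronos-Bolt entry above mentions zero-shot inference through AutoGluon, but the card's code blocks were stripped. A minimal sketch under stated assumptions: the `autogluon.timeseries` API shown (TimeSeriesDataFrame, TimeSeriesPredictor, the "Chronos" hyperparameter key with `model_path`) reflects recent AutoGluon releases, and the example CSV URL is an AutoGluon tutorial dataset used here as a placeholder; verify both against current documentation.

```python
# Hedged sketch: zero-shot forecasting with Chronos-Bolt (Tiny) via AutoGluon.
# Assumes `pip install autogluon.timeseries` and a long-format table with
# item_id / timestamp / target columns.
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Example dataset URL (assumption, taken from AutoGluon tutorials).
train_data = TimeSeriesDataFrame(
    "https://autogluon.s3.amazonaws.com/datasets/timeseries/m4_hourly/train.csv"
)

predictor = TimeSeriesPredictor(prediction_length=48).fit(
    train_data,
    hyperparameters={"Chronos": {"model_path": "autogluon/chronos-bolt-tiny"}},
)

# Quantile and mean forecasts for the next 48 steps of each series.
predictions = predictor.predict(train_data)
print(predictions.head())
```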
model_data_json/bartowski_Llama-3.2-3B-Instruct-GGUF.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "model_id": "bartowski/Llama-3.2-3B-Instruct-GGUF",
3
+ "downloads": 74797,
4
+ "tags": [
5
+ "gguf",
6
+ "facebook",
7
+ "meta",
8
+ "llama",
9
+ "llama-3",
10
+ "text-generation",
11
+ "en",
12
+ "de",
13
+ "fr",
14
+ "it",
15
+ "pt",
16
+ "hi",
17
+ "es",
18
+ "th",
19
+ "base_model:meta-llama/Llama-3.2-3B-Instruct",
20
+ "base_model:quantized:meta-llama/Llama-3.2-3B-Instruct",
21
+ "license:llama3.2",
22
+ "endpoints_compatible",
23
+ "region:us",
24
+ "conversational"
25
+ ],
26
+ "description": "--- base_model: meta-llama/Llama-3.2-3B-Instruct language: - en - de - fr - it - pt - hi - es - th license: llama3.2 pipeline_tag: text-generation tags: - facebook - meta - llama - llama-3 quantized_by: bartowski extra_gated_prompt: \"### LLAMA 3.2 COMMUNITY LICENSE AGREEMENT\\n\\nLlama 3.2 Version\\ \\ Release Date: September 25, 2024\\n\\n“Agreement” means the terms and conditions\\ \\ for use, reproduction, distribution and modification of the Llama Materials set\\ \\ forth herein.\\n\\n“Documentation” means the specifications, manuals and documentation\\ \\ accompanying Llama 3.2 distributed by Meta at \\n“Licensee” or “you” means you, or your employer or any other person or entity\\ \\ (if you are entering into this Agreement on such person or entity’s behalf),\\ \\ of the age required under applicable laws, rules or regulations to provide legal\\ \\ consent and that has legal authority to bind your employer or such other person\\ \\ or entity if you are entering in this Agreement on their behalf.\\n\\n“Llama 3.2”\\ \\ means the foundational large language models and software and algorithms, including\\ \\ machine-learning model code, trained model weights, inference-enabling code, training-enabling\\ \\ code, fine-tuning enabling code and other elements of the foregoing distributed\\ \\ by Meta at Materials” means,\\ \\ collectively, Meta’s proprietary Llama 3.2 and Documentation (and any portion\\ \\ thereof) made available under this Agreement.\\n\\n“Meta” or “we” means Meta Platforms\\ \\ Ireland Limited (if you are located in or, if you are an entity, your principal\\ \\ place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if\\ \\ you are located outside of the EEA or Switzerland). \\n\\nBy clicking “I Accept”\\ \\ below or by using or distributing any portion or element of the Llama Materials,\\ \\ you agree to be bound by this Agreement.\\n\\n1. License Rights and Redistribution.\\n\\ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable\\ \\ and royalty-free limited license under Meta’s intellectual property or other rights\\ \\ owned by Meta embodied in the Llama Materials to use, reproduce, distribute,\\ \\ copy, create derivative works of, and make modifications to the Llama Materials.\\ \\ \\nb. Redistribution and Use. \\ni. If you distribute or make available the Llama\\ \\ Materials (or any derivative works thereof), or a product or service (including\\ \\ another AI model) that contains any of them, you shall (A) provide a copy of this\\ \\ Agreement with any such Llama Materials; and (B) prominently display “Built with\\ \\ Llama” on a related website, user interface, blogpost, about page, or product\\ \\ documentation. If you use the Llama Materials or any outputs or results of the\\ \\ Llama Materials to create, train, fine tune, or otherwise improve an AI model,\\ \\ which is distributed or made available, you shall also include “Llama” at the\\ \\ beginning of any such AI model name.\\nii. If you receive Llama Materials, or any\\ \\ derivative works thereof, from a Licensee as part of an integrated end user product,\\ \\ then Section 2 of this Agreement will not apply to you. \\niii. You must retain\\ \\ in all copies of the Llama Materials that you distribute the following attribution\\ \\ notice within a “Notice” text file distributed as a part of such copies: “Llama\\ \\ 3.2 is licensed under the Llama 3.2 Community License, Copyright © Meta Platforms,\\ \\ Inc. All Rights Reserved.”\\niv. 
Your use of the Llama Materials must comply with\\ \\ applicable laws and regulations (including trade compliance laws and regulations)\\ \\ and adhere to the Acceptable Use Policy for the Llama Materials (available at\\ \\ which is hereby incorporated by reference\\ \\ into this Agreement.\\n \\n2. Additional Commercial Terms. If, on the Llama 3.2\\ \\ version release date, the monthly active users of the products or services made\\ \\ available by or for Licensee, or Licensee’s affiliates, is greater than 700 million\\ \\ monthly active users in the preceding calendar month, you must request a license\\ \\ from Meta, which Meta may grant to you in its sole discretion, and you are not\\ \\ authorized to exercise any of the rights under this Agreement unless or until\\ \\ Meta otherwise expressly grants you such rights.\\n3. Disclaimer of Warranty. UNLESS\\ \\ REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM\\ \\ ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS\\ \\ ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION,\\ \\ ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR\\ \\ PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING\\ \\ OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR\\ \\ USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.\\n4. Limitation of Liability.\\ \\ IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY,\\ \\ WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING\\ \\ OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL,\\ \\ INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE\\ \\ BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.\\n5. Intellectual Property.\\n\\ a. No trademark licenses are granted under this Agreement, and in connection with\\ \\ the Llama Materials, neither Meta nor Licensee may use any name or mark owned\\ \\ by or associated with the other or any of its affiliates, except as required\\ \\ for reasonable and customary use in describing and redistributing the Llama Materials\\ \\ or as set forth in this Section 5(a). Meta hereby grants you a license to use\\ \\ “Llama” (the “Mark”) solely as required to comply with the last sentence of Section\\ \\ 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at\\ \\ All goodwill arising\\ \\ out of your use of the Mark will inure to the benefit of Meta.\\nb. Subject to\\ \\ Meta’s ownership of Llama Materials and derivatives made by or for Meta, with\\ \\ respect to any derivative works and modifications of the Llama Materials that\\ \\ are made by you, as between you and Meta, you are and will be the owner of such\\ \\ derivative works and modifications.\\nc. If you institute litigation or other proceedings\\ \\ against Meta or any entity (including a cross-claim or counterclaim in a lawsuit)\\ \\ alleging that the Llama Materials or Llama 3.2 outputs or results, or any portion\\ \\ of any of the foregoing, constitutes infringement of intellectual property or\\ \\ other rights owned or licensable by you, then any licenses granted to you under\\ \\ this Agreement shall terminate as of the date such litigation or claim is filed\\ \\ or instituted. 
You will indemnify and hold harmless Meta from and against any\\ \\ claim by any third party arising out of or related to your use or distribution\\ \\ of the Llama Materials.\\n6. Term and Termination. The term of this Agreement will\\ \\ commence upon your acceptance of this Agreement or access to the Llama Materials\\ \\ and will continue in full force and effect until terminated in accordance with\\ \\ the terms and conditions herein. Meta may terminate this Agreement if you are\\ \\ in breach of any term or condition of this Agreement. Upon termination of this\\ \\ Agreement, you shall delete and cease use of the Llama Materials. Sections 3,\\ \\ 4 and 7 shall survive the termination of this Agreement. \\n7. Governing Law and\\ \\ Jurisdiction. This Agreement will be governed and construed under the laws of\\ \\ the State of California without regard to choice of law principles, and the UN\\ \\ Convention on Contracts for the International Sale of Goods does not apply to\\ \\ this Agreement. The courts of California shall have exclusive jurisdiction of\\ \\ any dispute arising out of this Agreement. \\n### Llama 3.2 Acceptable Use Policy\\n\\ Meta is committed to promoting safe and fair use of its tools and features, including\\ \\ Llama 3.2. If you access or use Llama 3.2, you agree to this Acceptable Use Policy\\ \\ (“**Policy**”). The most recent copy of this policy can be found at #### Prohibited Uses\\nWe want everyone to use Llama 3.2 safely and responsibly.\\ \\ You agree you will not use, or allow others to use, Llama 3.2 to:\\n1. Violate\\ \\ the law or others’ rights, including to:\\n 1. Engage in, promote, generate,\\ \\ contribute to, encourage, plan, incite, or further illegal or unlawful activity\\ \\ or content, such as:\\n 1. Violence or terrorism\\n 2. Exploitation\\ \\ or harm to children, including the solicitation, creation, acquisition, or dissemination\\ \\ of child exploitative content or failure to report Child Sexual Abuse Material\\n\\ \\ 3. Human trafficking, exploitation, and sexual violence\\n 4. The\\ \\ illegal distribution of information or materials to minors, including obscene\\ \\ materials, or failure to employ legally required age-gating in connection with\\ \\ such information or materials.\\n 5. Sexual solicitation\\n 6. Any\\ \\ other criminal activity\\n 1. Engage in, promote, incite, or facilitate the\\ \\ harassment, abuse, threatening, or bullying of individuals or groups of individuals\\n\\ \\ 2. Engage in, promote, incite, or facilitate discrimination or other unlawful\\ \\ or harmful conduct in the provision of employment, employment benefits, credit,\\ \\ housing, other economic benefits, or other essential goods and services\\n 3.\\ \\ Engage in the unauthorized or unlicensed practice of any profession including,\\ \\ but not limited to, financial, legal, medical/health, or related professional\\ \\ practices\\n 4. Collect, process, disclose, generate, or infer private or sensitive\\ \\ information about individuals, including information about individuals’ identity,\\ \\ health, or demographic information, unless you have obtained the right to do so\\ \\ in accordance with applicable law\\n 5. Engage in or facilitate any action or\\ \\ generate any content that infringes, misappropriates, or otherwise violates any\\ \\ third-party rights, including the outputs or results of any products or services\\ \\ using the Llama Materials\\n 6. 
Create, generate, or facilitate the creation\\ \\ of malicious code, malware, computer viruses or do anything else that could disable,\\ \\ overburden, interfere with or impair the proper working, integrity, operation\\ \\ or appearance of a website or computer system\\n 7. Engage in any action, or\\ \\ facilitate any action, to intentionally circumvent or remove usage restrictions\\ \\ or other safety measures, or to enable functionality disabled by Meta \\n2. Engage\\ \\ in, promote, incite, facilitate, or assist in the planning or development of activities\\ \\ that present a risk of death or bodily harm to individuals, including use of Llama\\ \\ 3.2 related to the following:\\n 8. Military, warfare, nuclear industries or\\ \\ applications, espionage, use for materials or activities that are subject to the\\ \\ International Traffic Arms Regulations (ITAR) maintained by the United States\\ \\ Department of State or to the U.S. Biological Weapons Anti-Terrorism Act of 1989\\ \\ or the Chemical Weapons Convention Implementation Act of 1997\\n 9. Guns and\\ \\ illegal weapons (including weapon development)\\n 10. Illegal drugs and regulated/controlled\\ \\ substances\\n 11. Operation of critical infrastructure, transportation technologies,\\ \\ or heavy machinery\\n 12. Self-harm or harm to others, including suicide, cutting,\\ \\ and eating disorders\\n 13. Any content intended to incite or promote violence,\\ \\ abuse, or any infliction of bodily harm to an individual\\n3. Intentionally deceive\\ \\ or mislead others, including use of Llama 3.2 related to the following:\\n 14.\\ \\ Generating, promoting, or furthering fraud or the creation or promotion of disinformation\\n\\ \\ 15. Generating, promoting, or furthering defamatory content, including the\\ \\ creation of defamatory statements, images, or other content\\n 16. Generating,\\ \\ promoting, or further distributing spam\\n 17. Impersonating another individual\\ \\ without consent, authorization, or legal right\\n 18. Representing that the\\ \\ use of Llama 3.2 or outputs are human-generated\\n 19. Generating or facilitating\\ \\ false online engagement, including fake reviews and other means of fake online\\ \\ engagement \\n4. Fail to appropriately disclose to end users any known dangers\\ \\ of your AI system 5. Interact with third party tools, models, or software designed\\ \\ to generate unlawful content or engage in unlawful or harmful conduct and/or represent\\ \\ that the outputs of such tools, models, or software are associated with Meta or\\ \\ Llama 3.2\\n\\nWith respect to any multimodal models included in Llama 3.2, the\\ \\ rights granted under Section 1(a) of the Llama 3.2 Community License Agreement\\ \\ are not being granted to you if you are an individual domiciled in, or a company\\ \\ with a principal place of business in, the European Union. 
This restriction does\\ \\ not apply to end users of a product or service that incorporates any such multimodal\\ \\ models.\\n\\nPlease report any violation of this Policy, software “bug,” or other\\ \\ problems that could lead to a violation of this Policy through one of the following\\ \\ means:\\n\\n* Reporting issues with the model: * Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback\\n\\ * Reporting bugs and security concerns: facebook.com/whitehat/info\\n\\ * Reporting violations of the Acceptable Use Policy or unlicensed uses of Llama\\ \\ 3.2: [email protected]\" extra_gated_fields: First Name: text Last Name: text Date of birth: date_picker Country: country Affiliation: text Job title: type: select options: - Student - Research Graduate - AI researcher - AI developer/engineer - Reporter - Other geo: ip_location ? By clicking Submit below I accept the terms of the license and acknowledge that the information I provide will be collected stored processed and shared in accordance with the Meta Privacy Policy : checkbox extra_gated_description: The information you provide will be collected, stored, processed and shared in accordance with the Meta Privacy Policy. extra_gated_button_content: Submit --- ## Llamacpp imatrix Quantizations of Llama-3.2-3B-Instruct Using <a href=\" release <a href=\" for quantization. Original model: All quants made using imatrix option with dataset from here Run them in LM Studio ## Prompt format ## Download a file (not the whole branch) from below: | Filename | Quant type | File Size | Split | Description | | -------- | ---------- | --------- | ----- | ----------- | | Llama-3.2-3B-Instruct-f16.gguf | f16 | 6.43GB | false | Full F16 weights. | | Llama-3.2-3B-Instruct-Q8_0.gguf | Q8_0 | 3.42GB | false | Extremely high quality, generally unneeded but max available quant. | | Llama-3.2-3B-Instruct-Q6_K_L.gguf | Q6_K_L | 2.74GB | false | Uses Q8_0 for embed and output weights. Very high quality, near perfect, *recommended*. | | Llama-3.2-3B-Instruct-Q6_K.gguf | Q6_K | 2.64GB | false | Very high quality, near perfect, *recommended*. | | Llama-3.2-3B-Instruct-Q5_K_L.gguf | Q5_K_L | 2.42GB | false | Uses Q8_0 for embed and output weights. High quality, *recommended*. | | Llama-3.2-3B-Instruct-Q5_K_M.gguf | Q5_K_M | 2.32GB | false | High quality, *recommended*. | | Llama-3.2-3B-Instruct-Q5_K_S.gguf | Q5_K_S | 2.27GB | false | High quality, *recommended*. | | Llama-3.2-3B-Instruct-Q4_K_L.gguf | Q4_K_L | 2.11GB | false | Uses Q8_0 for embed and output weights. Good quality, *recommended*. | | Llama-3.2-3B-Instruct-Q4_K_M.gguf | Q4_K_M | 2.02GB | false | Good quality, default size for must use cases, *recommended*. | | Llama-3.2-3B-Instruct-Q4_K_S.gguf | Q4_K_S | 1.93GB | false | Slightly lower quality with more space savings, *recommended*. | | Llama-3.2-3B-Instruct-Q4_0_8_8.gguf | Q4_0_8_8 | 1.92GB | false | Optimized for ARM inference. Requires 'sve' support (see link below). | | Llama-3.2-3B-Instruct-Q4_0_4_8.gguf | Q4_0_4_8 | 1.92GB | false | Optimized for ARM inference. Requires 'i8mm' support (see link below). | | Llama-3.2-3B-Instruct-Q4_0_4_4.gguf | Q4_0_4_4 | 1.92GB | false | Optimized for ARM inference. Should work well on all ARM chips, pick this if you're unsure. | | Llama-3.2-3B-Instruct-Q4_0.gguf | Q4_0 | 1.92GB | false | Legacy format, generally not worth using over similarly sized formats | | Llama-3.2-3B-Instruct-Q3_K_XL.gguf | Q3_K_XL | 1.91GB | false | Uses Q8_0 for embed and output weights. 
Lower quality but usable, good for low RAM availability. | | Llama-3.2-3B-Instruct-IQ4_XS.gguf | IQ4_XS | 1.83GB | false | Decent quality, smaller than Q4_K_S with similar performance, *recommended*. | | Llama-3.2-3B-Instruct-Q3_K_L.gguf | Q3_K_L | 1.82GB | false | Lower quality but usable, good for low RAM availability. | | Llama-3.2-3B-Instruct-IQ3_M.gguf | IQ3_M | 1.60GB | false | Medium-low quality, new method with decent performance comparable to Q3_K_M. | ## Embed/output weights Some of these quants (Q3_K_XL, Q4_K_L etc) are the standard quantization method with the embeddings and output weights quantized to Q8_0 instead of what they would normally default to. Some say that this improves the quality, others don't notice any difference. If you use these models PLEASE COMMENT with your findings. I would like feedback that these are actually used and useful so I don't keep uploading quants no one is using. Thanks! ## Downloading using huggingface-cli First, make sure you have hugginface-cli installed: Then, you can target the specific file you want: If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run: You can either specify a new local-dir (Llama-3.2-3B-Instruct-Q8_0) or download them all in place (./) ## Q4_0_X_X These are *NOT* for Metal (Apple) offloading, only ARM chips. If you're using an ARM chip, the Q4_0_X_X quants will have a substantial speedup. Check out Q4_0_4_4 speed comparisons on the original pull request To check which one would work best for your ARM chip, you can check AArch64 SoC features (thanks EloyOn!). ## Which file should I choose? A great write up with charts showing various performances is provided by Artefact2 here The first thing to figure out is how big a model you can run. To do this, you'll need to figure out how much RAM and/or VRAM you have. If you want your model running as FAST as possible, you'll want to fit the whole thing on your GPU's VRAM. Aim for a quant with a file size 1-2GB smaller than your GPU's total VRAM. If you want the absolute maximum quality, add both your system RAM and your GPU's VRAM together, then similarly grab a quant with a file size 1-2GB Smaller than that total. Next, you'll need to decide if you want to use an 'I-quant' or a 'K-quant'. If you don't want to think too much, grab one of the K-quants. These are in format 'QX_K_X', like Q5_K_M. If you want to get more into the weeds, you can check out this extremely useful feature chart: llama.cpp feature matrix But basically, if you're aiming for below Q4, and you're running cuBLAS (Nvidia) or rocBLAS (AMD), you should look towards the I-quants. These are in format IQX_X, like IQ3_M. These are newer and offer better performance for their size. These I-quants can also be used on CPU and Apple Metal, but will be slower than their K-quant equivalent, so speed vs performance is a tradeoff you'll have to decide. The I-quants are *not* compatible with Vulcan, which is also AMD, so if you have an AMD card double check if you're using the rocBLAS build or the Vulcan build. At the time of writing this, LM Studio has a preview with ROCm support, and other inference engines have specific builds for ROCm. ## Credits Thank you kalomaze and Dampf for assistance in creating the imatrix calibration dataset Thank you ZeroWw for the inspiration to experiment with embed/output Want to support my work? Visit my ko-fi page here:",
27
+ "model_explanation_gemini": "A quantized version of Meta's Llama-3.2-3B-Instruct model for multilingual text generation, optimized for efficiency with GGUF format.\n\n**Features:** \n- Base model: Meta's Llama-3.2-3B-Instruct \n- Languages: English, German, French, Italian, Portuguese, Hindi, Spanish, Thai \n- License: Llama 3.2 Community License \n- Task: Text generation \n- Quantization: GGUF format by bart"
28
+ }
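The GGUF entry above walks through choosing a quant and downloading a single file rather than the whole branch; the card's shell commands were stripped. The sketch below is one possible workflow, not the card's exact commands: it assumes `huggingface_hub` and `llama-cpp-python` are installed, and the Q4_K_M filename is taken from the quant table in the description.

```python
# Hedged sketch: fetch one GGUF quant and run a local chat completion.
# Assumes `pip install huggingface_hub llama-cpp-python`; llama-cpp-python is only
# one of several GGUF-compatible runtimes (llama.cpp CLI, LM Studio, ...).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

gguf_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # "good quality" default pick from the table
)

llm = Llama(model_path=gguf_path, n_ctx=4096)
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "In one sentence, what is GGUF quantization?"}],
    max_tokens=128,
)
print(out["choices"][0]["message"]["content"])
```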
model_data_json/benjamin-paine_stable-diffusion-v1-5-inpainting.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "model_id": "benjamin-paine/stable-diffusion-v1-5-inpainting",
3
+ "downloads": 256622,
4
+ "tags": [
5
+ "diffusers",
6
+ "safetensors",
7
+ "stable-diffusion",
8
+ "stable-diffusion-diffusers",
9
+ "text-to-image",
10
+ "arxiv:2207.12598",
11
+ "arxiv:2112.10752",
12
+ "arxiv:2103.00020",
13
+ "arxiv:2205.11487",
14
+ "arxiv:1910.09700",
15
+ "license:creativeml-openrail-m",
16
+ "diffusers:StableDiffusionInpaintPipeline",
17
+ "region:us"
18
+ ],
19
+ "description": "--- license: creativeml-openrail-m tags: - stable-diffusion - stable-diffusion-diffusers - text-to-image inference: false library_name: diffusers extra_gated_prompt: |- One more step before getting this model. This model is open access and available to all, with a CreativeML OpenRAIL-M license further specifying rights and usage. The CreativeML OpenRAIL License specifies: 1. You can't use the model to deliberately produce nor share illegal or harmful outputs or content 2. CompVis claims no rights on the outputs you generate, you are free to use them and are accountable for their use which must not go against the provisions set in the license 3. You may re-distribute the weights and use the model commercially and/or as a service. If you do, please be aware you have to include the same use restrictions as the ones in the license and share a copy of the CreativeML OpenRAIL-M to all your users (please read the license entirely and carefully) Please read the full license here: By clicking on \"Access repository\" below, you accept that your *contact information* (email address and username) can be shared with the model authors as well. extra_gated_fields: I have read the License and agree with its terms: checkbox --- # Re-upload This repository is being re-uploaded to HuggingFace in accordance with The CreativeML OpenRAIL-M License under which this repository was originally uploaded, specifically **Section II** which grants: > ...a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model. Note that these files did not come from HuggingFace, but instead from modelscope. As such, some files that were present in the original repository may not be present. File integrity has been verified via checksum. # Original Model Card Stable Diffusion Inpainting is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input, with the extra capability of inpainting the pictures by using a mask. The **Stable-Diffusion-Inpainting** was initialized with the weights of the Stable-Diffusion-v-1-2. First 595k steps regular training, then 440k steps of inpainting training at resolution 512x512 on “laion-aesthetics v2 5+” and 10% dropping of the text-conditioning to improve classifier-free classifier-free guidance sampling. For inpainting, the UNet has 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself) whose weights were zero-initialized after restoring the non-inpainting checkpoint. During training, we generate synthetic masks and in 25% mask everything. :** English - **License:** The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which our license is based. - **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a Latent Diffusion Model that uses a fixed, pretrained text encoder (CLIP ViT-L/14) as suggested in the Imagen paper. - **Resources for more information:** Paper. 
- **Cite as:** @InProceedings{Rombach_2022_CVPR, author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\\\"orn}, title = {High-Resolution Image Synthesis With Latent Diffusion Models}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2022}, pages = {10684-10695} } # Uses ## Direct Use The model is intended for research purposes only. Possible research areas and tasks include - Safe deployment of models which have the potential to generate harmful content. - Probing and understanding the limitations and biases of generative models. - Generation of artworks and use in design and other artistic processes. - Applications in educational or creative tools. - Research on generative models. Excluded uses are described below. ### Misuse, Malicious Use, and Out-of-Scope Use _Note: This section is taken from the DALLE-MINI model card, but applies in the same way to Stable Diffusion v1_. The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes. #### Out-of-Scope Use The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model. #### Misuse and Malicious Use Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to: - Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc. - Intentionally promoting or propagating discriminatory content or harmful stereotypes. - Impersonating individuals without their consent. - Sexual content without consent of the people who might see it. - Mis- and disinformation - Representations of egregious violence and gore - Sharing of copyrighted or licensed material in violation of its terms of use. - Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use. ## Limitations and Bias ### Limitations - The model does not achieve perfect photorealism - The model cannot render legible text - The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere” - Faces and people in general may not be generated properly. - The model was trained mainly with English captions and will not work as well in other languages. - The autoencoding part of the model is lossy - The model was trained on a large-scale dataset LAION-5B which contains adult material and is not fit for product use without additional safety mechanisms and considerations. - No additional measures were used to deduplicate the dataset. As a result, we observe some degree of memorization for images that are duplicated in the training data. The training data can be searched at to possibly assist in the detection of memorized images. ### Bias While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases. Stable Diffusion v1 was trained on subsets of LAION-2B(en), which consists of images that are primarily limited to English descriptions. 
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for. This affects the overall output of the model, as white and western cultures are often set as the default. Further, the ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts. ## Training **Training Data** The model developers used the following dataset for training the model: - LAION-2B (en) and subsets thereof (see next section) **Training Procedure** Stable Diffusion v1 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training, - Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4 - Text prompts are encoded through a ViT-L/14 text-encoder. - The non-pooled output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention. - The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We currently provide six checkpoints, , and , , and which were trained as follows, - : 237k steps at resolution on laion2B-en. 194k steps at resolution on laion-high-resolution (170M examples from LAION-5B with resolution ). - : Resumed from . 515k steps at resolution on \"laion-improved-aesthetics\" (a subset of laion2B-en, filtered to images with an original size , estimated aesthetics score , and an estimated watermark probability . The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an improved aesthetics estimator). - : Resumed from . 195k steps at resolution on \"laion-improved-aesthetics\" and 10\\% dropping of the text-conditioning to improve classifier-free guidance sampling. - : Resumed from stable-diffusion-v1-2.225,000 steps at resolution 512x512 on \"laion-aesthetics v2 5+\" and 10 % dropping of the text-conditioning to classifier-free guidance sampling. - : Resumed from sd-v1-2.ckpt. 595k steps at resolution 512x512 on \"laion-aesthetics v2 5+\" and 10% dropping of the text-conditioning to improve classifier-free guidance sampling. - : Resumed from sd-v1-2.ckpt. 595k steps at resolution 512x512 on \"laion-aesthetics v2 5+\" and 10% dropping of the text-conditioning to improve classifier-free guidance sampling. Then 440k steps of inpainting training at resolution 512x512 on “laion-aesthetics v2 5+” and 10% dropping of the text-conditioning. For inpainting, the UNet has 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself) whose weights were zero-initialized after restoring the non-inpainting checkpoint. During training, we generate synthetic masks and in 25% mask everything. - **Hardware:** 32 x 8 x A100 GPUs - **Optimizer:** AdamW - **Gradient Accumulations**: 2 - **Batch:** 32 x 8 x 2 x 4 = 2048 - **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant ## Evaluation Results Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling steps show the relative improvements of the checkpoints: !pareto Evaluated using 50 PLMS steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores. 
## Inpainting Evaluation To assess the performance of the inpainting model, we used the same evaluation protocol as in our LDM paper. Since the Stable Diffusion Inpainting Model acccepts a text input, we simply used a fixed prompt of . | Model | FID | LPIPS | |-----------------------------|------|------------------| | Stable Diffusion Inpainting | 1.00 | 0.141 (+- 0.082) | | Latent Diffusion Inpainting | 1.50 | 0.137 (+- 0.080) | | CoModGAN | 1.82 | 0.15 | | LaMa | 2.21 | 0.134 (+- 0.080) | ## Environmental Impact **Stable Diffusion v1** **Estimated Emissions** Based on that information, we estimate the following CO2 emissions using the Machine Learning Impact calculator presented in Lacoste et al. (2019). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact. - **Hardware Type:** A100 PCIe 40GB - **Hours used:** 150000 - **Cloud Provider:** AWS - **Compute Region:** US-east - **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 11250 kg CO2 eq. ## Citation *This model card was written by: Robin Rombach and Patrick Esser and is based on the DALL-E Mini model card.*",
20
+ "model_explanation_gemini": "Generates and modifies images via text prompts with inpainting capabilities, using masked inputs to edit specific areas while maintaining photo-realism.\n\n**Model Features:** \n- Text-to-image generation with inpainting (mask-based editing) \n- Trained on laion-aesthetics v2 5+ dataset at 512x512 resolution \n- Modified UNet architecture with 5 extra input channels (4 for masked image, 1 for mask) \n- Supports classifier-free guidance sampling \n- Open"
21
+ }
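The inpainting entry above is tagged with the diffusers `StableDiffusionInpaintPipeline`, which takes a prompt, a source image, and a mask. A minimal sketch, assuming a CUDA GPU and two local 512x512 placeholder files (`input.png` and `mask.png`, where white marks the region to repaint):

```python
# Hedged sketch: text-guided inpainting with the re-uploaded SD 1.5 inpainting weights.
# Assumes `pip install diffusers transformers accelerate torch` and a CUDA device;
# file paths and the prompt are placeholders.
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "benjamin-paine/stable-diffusion-v1-5-inpainting",
    torch_dtype=torch.float16,
).to("cuda")

image = load_image("input.png")  # source image (placeholder path)
mask = load_image("mask.png")    # mask: white pixels are repainted, black pixels are kept

result = pipe(
    prompt="a vase of flowers on a wooden table",
    image=image,
    mask_image=mask,
).images[0]
result.save("inpainted.png")
```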