Spaces:

nyax
/

PhyloLM

Running on Zero

App Files Files Community

Daetheys committed 8 days ago

Commit

dc83bf2

1 Parent(s): 114995f

Fixed git push issue

Browse files

Files changed (2) hide show

app.py +56 -14
loading.py +19 -7

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 import os
 import numpy as np
 import ujson as json
 from loading import load_data, save_git
 from tools import compute_ordered_matrix
@@ -101,8 +103,8 @@ def search_bar_gr(model_names,slider=True,double_search=False,key=None):
     ret.insert(0,col2)
     return ret
-import spaces
-@spaces.GPU(duration=300)
 def _run(path,genes,N,progress_bar):
     #Load the model
     progress_bar(0.20, desc="Loading Model...",total=100)
@@ -110,28 +112,35 @@ def _run(path,genes,N,progress_bar):
         model,tokenizer = load_model(path)
     except ValueError as e:
             print(f"Error loading model '{path}': {e}")
-            gr.Warning("Model couldn't load. This space currently only works with AutoModelForCausalLM models and trust_remote_code=False. Please check the model architecture and whether it requires the execution of custom code and try again.")
             return None
     except OSError as e:
             print(f"Error loading model '{path}': {e}")
-            gr.Warning("Model doesn't seem to exist on the HuggingFace Hub or might be gated. Please check the model name and try again.")
             return None
     except RuntimeError as e:
             if 'out of memory' in str(e):
                 print(f"Error loading model '{path}': {e}")
-                gr.Warning("Loading the model triggered an out of memory error. It may be too big for the GPU (80Go RAM). Please try again with a smaller model.")
                 return None
             else:
                 print(f"Error loading model '{path}': {e}")
-                gr.Warning("Model couldn't be loaded. Please check the logs or report an issue.")
                 return None
     except Exception as e:
             print(f"Error loading model '{path}': {e}")
-            gr.Warning("Model couldn't be loaded. Please check logs or report an issue.")
             return None
     progress_bar(0.25, desc="Generating data...",total=100)
     for i,output in enumerate(llm_run(model,tokenizer,genes,N)):
-        progress_bar(0.25 + i*(70/len(genes))/100, desc=f"Generating data... {i+1}/{len(genes)}",total=100)
     return output
 def run(path,progress_bar):
@@ -171,7 +180,37 @@ def run(path,progress_bar):
     progress_bar(0.95, desc="Saving data ...",total=100)
     alleles = [[compl[j]['generated_text'][len(gene):][:4] for j in range(len(compl))] for gene,compl in zip(genes,output)]
-    save_git(alleles,genes,path,family)
     progress_bar(1, desc="Done!",total=100)
@@ -222,6 +261,8 @@ def reload_env():
 # Load environment variables
 USERNAME = os.environ['GITHUB_USERNAME']
 TOKEN = os.environ['GITHUB_TOKEN']
@@ -269,7 +310,7 @@ with gr.Blocks(title="PhyloLM", theme=gr.themes.Default()) as demo:
         "- A similarity matrix (values range from 0 = dissimilar to 1 = highly similar). \n"
         "- 2D and 3D scatter plots representing how close or far from each other LLMs are (plotted using UMAP). \n"
         "- A tree to visualize distances between models (distance from leaf A to leaf B in the tree is similar to the distance between the two models)\n\n"
-        "Models are colored according to their family (e.g., LLaMA, OPT, Mistral) for the ones that were in the original paper. Newly added models by users will be colored in grey. "
     )
     # Load models for the dropdown
@@ -322,7 +363,7 @@ with gr.Blocks(title="PhyloLM", theme=gr.themes.Default()) as demo:
     # Submit model section
-    gr.Markdown("## Submitting a Model")
     gr.Markdown(
         "You may contribute new models to this collaborative space using compute resources. "
@@ -341,8 +382,8 @@ with gr.Blocks(title="PhyloLM", theme=gr.themes.Default()) as demo:
     )
     with gr.Group():
-        model_input = gr.Textbox(label="Model", interactive=False)
-        submit_btn = gr.Button("Run PhyloLM", variant="primary",interactive=False)
     # Disclaimer and citation
@@ -386,7 +427,8 @@ url={https://openreview.net/forum?id=rTQNGQxm4K}
     tree_alpha_marker.change(fn=lambda x : slider_changeAlphaMarkers(x,'fig4'), inputs=tree_alpha_marker, outputs=FIGS_OBJECTS)
     # Run PhyloLM button
-    submit_btn.click(fn=prepare_run, inputs=[model_input], outputs=[model_input]).then(fn=reload_env, inputs=[], outputs=FIGS_OBJECTS+ [sim_mat_search_x, sim_mat_search_y, viz_search, tree_search])
     #Set more globals
     SIM_MAT_SEARCH_X = sim_mat_search_x

 import os
 import numpy as np
 import ujson as json
+import time
+from threading import Thread
 from loading import load_data, save_git
 from tools import compute_ordered_matrix
     ret.insert(0,col2)
     return ret
+#import spaces
+#@spaces.GPU(duration=300)
 def _run(path,genes,N,progress_bar):
     #Load the model
     progress_bar(0.20, desc="Loading Model...",total=100)
         model,tokenizer = load_model(path)
     except ValueError as e:
             print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model couldn't load. This space currently only works with AutoModelForCausalLM models and for security reasons cannot execute remote code. Please check the model architecture and whether it too recent and requires the execution of custom code.")
             return None
     except OSError as e:
             print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model doesn't seem to exist on the HuggingFace Hub or might be gated. Please check the model name and its accessibility.")
             return None
     except RuntimeError as e:
             if 'out of memory' in str(e):
                 print(f"Error loading model '{path}': {e}")
+                gr.Warning("Loading the model triggered an out of memory error. It may be too big for the GPU (80Go RAM max). Please verify the size of the model.")
                 return None
             else:
                 print(f"Error loading model '{path}': {e}")
+                gr.Warning("Model couldn't be loaded. Check the logs for what happened or report an issue including the model's name.")
                 return None
     except Exception as e:
             print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model couldn't be loaded. Check the logs for what happened or report an issue including the model's name.")
             return None
     progress_bar(0.25, desc="Generating data...",total=100)
+    time0 = time.perf_counter()
     for i,output in enumerate(llm_run(model,tokenizer,genes,N)):
+        time_elapsed = time.perf_counter()-time0
+        estimated_time_remaining = int(len(genes)*time_elapsed/(i+1))
+        minutes = str(estimated_time_remaining//60)
+        minutes = "0"*(2-min(2,len(minutes))) + minutes
+        seconds = str(estimated_time_remaining%60)
+        seconds = "0"*(2-min(2,len(seconds))) + seconds
+        progress_bar(0.25 + i*(70/len(genes))/100, desc=f"Generating data... {i+1}/{len(genes)} - estimated remaining time {minutes}:{seconds}",total=100)
     return output
 def run(path,progress_bar):
     progress_bar(0.95, desc="Saving data ...",total=100)
     alleles = [[compl[j]['generated_text'][len(gene):][:4] for j in range(len(compl))] for gene,compl in zip(genes,output)]
+    fsave = False
+    for i in range(10): #Trying to push
+        try:
+            save_git(alleles,genes,path,family)
+            fsave = True
+            break
+        except Exception as e:
+            print(f"Error saving data: {e}")
+            #Recloning the repo
+            try:
+                load_data(force_clone=True)
+            except Exception as e:
+                print(f"Error recloning repo: {e}")
+    if not fsave:
+        gr.Warning("Something went wrong with GitHub and data couldn't be sent to the server. Please check the logs. You can save the data manually by clicking the download button and creating a community post with the file or a pull request on the GitHub repository.")
+        def download_data():
+            d = {'family':family,'alleles':alleles}
+            model_name = path
+            data_path = f'math/{model_name}.json'
+            path = os.path.join('Data',data_path)
+            #create the file folder path
+            if not os.path.exists(os.path.dirname(path)):
+                os.makedirs(os.path.dirname(path), exist_ok=True)
+            #Open the file
+            with open(path,'w') as f:
+                json.dump(d,f)
+            # Provide the download link
+            return gr.File.update(value=path, label="Download data", file_name=f"{model_name}.json")
+        gr.Button("Download data",variant="primary").click(fn=download_data, inputs=[], outputs=None)
+        return None
     progress_bar(1, desc="Done!",total=100)
 # Load environment variables
+import dotenv
+dotenv.load_dotenv()
 USERNAME = os.environ['GITHUB_USERNAME']
 TOKEN = os.environ['GITHUB_TOKEN']
         "- A similarity matrix (values range from 0 = dissimilar to 1 = highly similar). \n"
         "- 2D and 3D scatter plots representing how close or far from each other LLMs are (plotted using UMAP). \n"
         "- A tree to visualize distances between models (distance from leaf A to leaf B in the tree is similar to the distance between the two models)\n\n"
+        "Models are colored according to their family (e.g., LLaMA, OPT, Mistral) for the ones that were in the original paper. Models added by users are colored in grey for now. "
     )
     # Load models for the dropdown
     # Submit model section
+    gr.Markdown("## Submit a Model")
     gr.Markdown(
         "You may contribute new models to this collaborative space using compute resources. "
     )
     with gr.Group():
+        model_input = gr.Textbox(label="Model", interactive=True)
+        submit_btn = gr.Button("Run PhyloLM", variant="primary",interactive=True)
     # Disclaimer and citation
     tree_alpha_marker.change(fn=lambda x : slider_changeAlphaMarkers(x,'fig4'), inputs=tree_alpha_marker, outputs=FIGS_OBJECTS)
     # Run PhyloLM button
+    run_event = submit_btn.click(fn=prepare_run, inputs=[model_input], outputs=[model_input]).then(fn=reload_env, inputs=[], outputs=FIGS_OBJECTS+ [sim_mat_search_x, sim_mat_search_y, viz_search, tree_search])
+    #cancel_btn.click(fn=None,inputs=None,outputs=None,cancels=[run_event])
     #Set more globals
     SIM_MAT_SEARCH_X = sim_mat_search_x

loading.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 import ujson as json
 import pygit2
 from phylogeny import compute_all_P, compute_sim_matrix
 from plotting import get_color, UNKNOWN_COLOR, DEFAULT_COLOR
@@ -47,12 +49,18 @@ def load_data():
     return data, model_names, families, sim_matrix, colors
-def load_git():
     cred = pygit2.UserPass(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN'])
-    if os.path.exists('Data'):
         repo = pygit2.Repository('Data')
         remote = repo.remotes['origin']  # Use named reference instead of index
-        remote.fetch()
         # Get the current branch name
         branch_name = repo.head.shorthand
@@ -62,10 +70,14 @@ def load_git():
         # Merge the changes into the current branch
         remote_commit = repo.lookup_reference(remote_ref_name).target
-    else:
-        repo = pygit2.clone_repository('https://github.com/PhyloLM/Data', './Data', bare=False, callbacks=GitHubRemoteCallbacks(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN']))
     data_array = []
     model_names = []
     families = []

 import os
 import ujson as json
 import pygit2
+import shutil
+from pygit2.enums import MergeFavor
 from phylogeny import compute_all_P, compute_sim_matrix
 from plotting import get_color, UNKNOWN_COLOR, DEFAULT_COLOR
     return data, model_names, families, sim_matrix, colors
+def load_git(force_clone = False):
     cred = pygit2.UserPass(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN'])
+    if not os.path.exists('Data') or force_clone:
+        # Remove the existing directory if it exists
+        if os.path.exists('Data'):
+            shutil.rmtree('Data')
+        repo = pygit2.clone_repository('https://github.com/PhyloLM/Data', './Data', bare=False, callbacks=GitHubRemoteCallbacks(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN']))
+    else:
         repo = pygit2.Repository('Data')
         remote = repo.remotes['origin']  # Use named reference instead of index
+        fetch_results = remote.fetch()
+        print(fetch_results)
         # Get the current branch name
         branch_name = repo.head.shorthand
         # Merge the changes into the current branch
         remote_commit = repo.lookup_reference(remote_ref_name).target
+        #Resolve conflicts if any : strategy : theirs
+        try:
+            repo.merge(remote_commit)
+        except Exception as e:
+            print(f"Merge error: {e}")
+            # Redownload the repository if merge fails
+            return load_git(force_clone=True)
     data_array = []
     model_names = []
     families = []