Commit
·
1b4e9c9
1
Parent(s):
1f4cd81
Upload 2 files
Browse files- app.py +225 -0
- requirements.txt +11 -0
app.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
import re
|
4 |
+
from os.path import splitext, exists
|
5 |
+
import nltk
|
6 |
+
from nltk.tokenize import word_tokenize
|
7 |
+
import gradio as gr
|
8 |
+
import backoff
|
9 |
+
import markdown
|
10 |
+
from docx import Document
|
11 |
+
from io import StringIO
|
12 |
+
from datetime import datetime
|
13 |
+
import tempfile
|
# Download the tokenizer data used by nltk.word_tokenize below.
nltk.download('punkt')

# SECURITY: an OpenAI API key was previously hard-coded here. A key committed
# to source control is leaked and must be revoked immediately. Read the key
# from the environment instead (export OPENAI_API_KEY before launching).
openai.api_key = os.getenv("OPENAI_API_KEY")
20 |
+
|
21 |
+
|
def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string.

    Strips the WEBVTT header, cue indexes, cue-identifier lines and
    timestamp lines, then joins the remaining caption text into one
    normalized string.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # read file content
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # remove header & empty lines; guard against an empty file before
    # peeking at lines[0] (the original raised IndexError here)
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # remove cue indexes (lines that are just a number)
    lines = [line for line in lines if not line.isdigit()]

    # remove cue identifiers (Teams-style "<uuid>/<n>-<n>" lines)
    cue_id = r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d'
    lines = [line for line in lines if not re.match(cue_id, line)]

    # remove timestamp lines ("HH:MM:SS.mmm --> HH:MM:SS.mmm");
    # the dots are escaped so '.' cannot match arbitrary characters
    timestamp = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(timestamp, line)]

    content = " ".join(lines)

    # collapse runs of whitespace into a single space
    content = re.sub(r"\s+", " ", content)

    # add a space after punctuation marks if it doesn't exist
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content
64 |
+
|
65 |
+
|
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to a text file.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
              Default is False

    Returns:
        str: path to text file
    """
    no_message = kwargs.get("no_message", False)

    # Derive an output name from the input when none was given, appending a
    # numeric suffix until the name does not collide with an existing file.
    if not file_out:
        base = splitext(file_in)[0]
        file_out = "%s.txt" % base
        suffix = 0
        while exists(file_out):
            suffix += 1
            file_out = "%s_%s.txt" % (base, suffix)

    cleaned = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as fp:
        fp.write(cleaned)
    if not no_message:
        print("clean content is written to file: %s" % file_out)

    return file_out
96 |
+
|
97 |
+
|
def get_summary(filepath):
    """Clean a .vtt transcript and write it to a sibling .txt file.

    Thin wrapper around vtt_to_clean_file kept for API compatibility.
    (The original `filepath = filepath` self-assignment was a no-op and
    has been removed.)

    Args:
        filepath (str): path to vtt file
    """
    vtt_to_clean_file(filepath)
101 |
+
|
102 |
+
|
def count_tokens(filename):
    """Return the number of NLTK word tokens in a text file.

    Args:
        filename (str): path to a UTF-8 text file

    Returns:
        int: token count
    """
    # Explicit encoding for consistency with clean_webvtt; the platform
    # default (e.g. cp1252 on Windows) can fail on UTF-8 transcripts.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return len(tokens)
108 |
+
|
109 |
+
|
def break_up_file(tokens, chunk_size, overlap_size):
    """Yield overlapping chunks of a token list.

    Each chunk holds at most ``chunk_size`` tokens; consecutive chunks
    share ``overlap_size`` tokens so no context is lost at the boundaries.

    Args:
        tokens (list): token sequence to split
        chunk_size (int): maximum tokens per chunk
        overlap_size (int): tokens shared between consecutive chunks

    Yields:
        list: the next chunk of tokens

    Raises:
        ValueError: if overlap_size >= chunk_size (would never terminate)
    """
    # Iterative rewrite of the original recursive generator: delegating via
    # `yield from` once per chunk could exhaust the interpreter's recursion
    # limit on very long transcripts.
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")
    start = 0
    while True:
        remaining = tokens[start:]
        if len(remaining) <= chunk_size:
            yield remaining
            return
        yield remaining[:chunk_size]
        start += step
117 |
+
|
118 |
+
|
def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    """Read a text file and split its tokens into overlapping chunks.

    Args:
        filename (str): path to a UTF-8 text file
        chunk_size (int, optional): maximum tokens per chunk. Default 4000.
        overlap_size (int, optional): tokens shared between consecutive
            chunks. Default 100.

    Returns:
        list: list of token-list chunks
    """
    # Explicit encoding for consistency with clean_webvtt / count_tokens.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
124 |
+
|
125 |
+
|
def convert_to_prompt_text(tokenized_text):
    """Join a token list back into plain prompt text.

    Rejoins tokens with single spaces and re-attaches the possessive
    "'s" that word tokenization splits off.
    """
    joined = " ".join(tokenized_text)
    return joined.replace(" 's", "'s")
130 |
+
|
131 |
+
|
def markdown_to_docx(md_text, output_file):
    """Render Markdown text into a .docx file, one paragraph per <p> block.

    NOTE(review): only <p> content survives the conversion; other HTML
    elements produced by the Markdown renderer (headings, lists) are dropped.
    """
    # Markdown -> HTML first; paragraph bodies are then scraped out of it.
    html_text = markdown.markdown(md_text)

    document = Document()
    for fragment in html_text.split('</p>'):
        if '<p>' not in fragment:
            continue
        paragraph = fragment.replace('<p>', '').strip()
        if paragraph:
            document.add_paragraph(paragraph)

    # Save the document to the specified file
    document.save(output_file)
148 |
+
|
149 |
+
|
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    """Summarize a meeting transcript file with the OpenAI chat API.

    The transcript is split into overlapping ~4000-token chunks, each chunk
    is summarized independently, and the partial summaries are then
    consolidated into a single markdown summary with action items.

    Args:
        filepath (str): path to a cleaned transcript text file

    Returns:
        str: markdown-formatted meeting summary
    """
    # Break the transcript into chunks small enough for the model context.
    # (The original also computed an unused token_count here.)
    chunks = break_up_file_to_chunks(filepath)

    # Summarize each chunk independently.
    chunk_summaries = []
    for chunk in chunks:
        prompt_request = convert_to_prompt_text(chunk)

        messages = [
            {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."},
            {"role": "user", "content": prompt_request},
        ]

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=.4,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        chunk_summaries.append(
            response["choices"][0]["message"]['content'].strip())

    # Consolidate the per-chunk summaries. Join them as plain text rather
    # than str(list) so the model does not see Python list repr syntax.
    prompt_request = "Consolidate these meeting summaries: " + \
        "\n\n".join(chunk_summaries)

    # Produce the final consolidated summary in markdown.
    messages = [{"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."}]
    messages.append({"role": "user", "content": prompt_request})
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response["choices"][0]["message"]['content'].strip()
203 |
+
|
204 |
+
|
def summarize_meeting_vtt(file):
    """Gradio handler: summarize an uploaded transcript file.

    Args:
        file: gradio file object exposing the upload's path via ``.name``

    Returns:
        str: markdown-formatted meeting summary
    """
    return summarize_meeting(file.name)
210 |
+
|
211 |
+
|
# Gradio UI: a single file upload in, a markdown meeting summary out.
demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input: the .vtt subtitle file exported from the meeting recording
    inputs=gr.File(label="Archivo .vtt"),
    # output: the consolidated summary rendered as markdown
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Hexagon Data - Resumen de reuniones con I.A.",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión.")


# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
python-docx
nltk
openai
markdown
backoff
gradio