Eric Botti committed
Commit · 9b23edc
1 Parent(s): e5d260a

updated to use gpt-3.5-turbo, improved streamlit interface
Browse files
- .gitignore +1 -2
- app.py +38 -13
- main.py +0 -105
- requirements.txt +0 -0
- setup.py +0 -26
- summarizer.py +116 -0
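With this change the project runs as a plain Streamlit app (app.py plus the new summarizer module) instead of the old config-driven main.py/setup.py scripts. Assuming the usual Streamlit entry point and that requirements.txt still carries streamlit, openai, and langchain, a local run would look something like:

pip install -r requirements.txt
streamlit run app.py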
.gitignore
CHANGED
@@ -1,5 +1,4 @@
 venv
 transcript.txt
 notes.txt
-notes.md
-config.ini
+notes.md
app.py
CHANGED
@@ -1,24 +1,49 @@
-from io import StringIO
-# 3rd party
+# 3rd party - located in requirements.txt
 import streamlit as st
 # local
+import summarizer

 st.set_page_config(page_title='Transcript Notetaker', page_icon=':memo:', layout='wide')

-upload_stringio = StringIO(upload.getvalue().decode('UTF-8'))
-st.markdown(notes)
+# App Content
+'''
+# Transcript Notetaker
+
+Upload a transcript of a Google Meet call and this app will use the OpenAI API to generate detailed notes for the meeting.
+
+_This program was designed to work with the transcript documents automatically generated by Google Meet meetings, using
+transcripts with a different format may result in unexpected behavior._
+'''
+
+api_key = st.text_input("Enter your OpenAI API key", type='password')
+
+uploaded_file = st.file_uploader('Upload your Transcript', type='.txt')
+
+if api_key and uploaded_file:
+    create_notes_button_disabled = False
+    create_notes_button_help = ''
+else:
+    create_notes_button_disabled = True
+    create_notes_button_help = "Enter your API key and upload a file to continue"
+
+button_create_notes = st.button("Create Notes", disabled=create_notes_button_disabled, help=create_notes_button_help)
+
+meeting_notes = None
+
+if button_create_notes:
+    header, transcript = summarizer.load_transcript(uploaded_file)
+
+    chunks = summarizer.chunk_transcript(transcript)
+
+    summaries = summarizer.summarize_chunks(chunks, api_key)
+
+    meeting_notes = summarizer.format_notes(summaries, header)
+
+if meeting_notes:
+    st.divider()
+
+    st.download_button("Download Notes", meeting_notes, "notes.md")
+
+    st.markdown(meeting_notes)
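The new app.py drives the summarizer module (added below in summarizer.py) end to end: load the uploaded transcript, chunk it, summarize each chunk, then format the notes. A minimal standalone sketch of the same pipeline, not part of the commit: it assumes a local transcript.txt, uses io.BytesIO to stand in for Streamlit's uploaded-file object (both expose .getvalue()), and passes debug=True so no OpenAI calls are made. Note that summarize_chunks also drives a Streamlit progress bar, so it is really intended to run inside the app.

# Hypothetical sketch: exercising summarizer.py outside the Streamlit UI
from io import BytesIO

import summarizer

with open("transcript.txt", "rb") as f:   # a Google Meet transcript export (assumed filename)
    upload = BytesIO(f.read())            # mimics Streamlit's UploadedFile: has .getvalue()

header, transcript = summarizer.load_transcript(upload)
chunks = summarizer.chunk_transcript(transcript)

# debug=True returns placeholder summaries instead of calling gpt-3.5-turbo
summaries = summarizer.summarize_chunks(chunks, user_api_key="unused", debug=True)

notes = summarizer.format_notes(summaries, header)
print(notes)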
main.py
DELETED
@@ -1,105 +0,0 @@
-# standard
-import configparser
-import os
-import time
-import re
-# 3rd party
-from langchain.llms import OpenAI
-from langchain.chat_models import ChatOpenAI
-from langchain import LLMChain
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain import PromptTemplate
-
-# read config
-config = configparser.ConfigParser()
-config.read('config.ini')
-
-# read config variables
-if not os.getenv("OPENAI_API_KEY"):
-    os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
-
-# LangChain Config
-# llm
-llm = OpenAI(temperature=0)
-# prompt
-prompt = PromptTemplate(
-    template="Write a concise summary of the following: {transcript}",
-    input_variables=['transcript']
-)
-# chain
-chain = LLMChain(
-    prompt=prompt,
-    llm=llm,
-    verbose=False
-)
-
-
-def load_transcript(input_file):
-    # Google Meet Transcripts have a header which we don't want to be summarized
-    header_lines = 5
-
-    file_text = input_file.readlines()
-
-    head = file_text[:header_lines]
-    transcript = "".join(file_text[header_lines:])
-
-    return head, transcript
-
-
-def create_meeting_notes(transcript_file):
-    # read config variables
-    # if not os.getenv("OPENAI_API_KEY"):
-    #     os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
-    # transcript_filepath = config['OPTIONAL']['transcript-filepath']
-    # notes_filepath = config['OPTIONAL']['notes-filepath']
-
-    head, transcript = load_transcript(transcript_file)
-
-    # split the transcript on the 5-min timestamps
-    regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
-    five_min_chunks = re.split(regex_pattern, transcript)
-
-    # create a textsplitter to subdivide those chunks into appropriately sized chunks.
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
-
-    # list the meeting time and the chunks associated with it
-    timestamped_summaries = []
-
-    print(f"Summarizing {len(five_min_chunks)*5} minute meeting")
-    start_time = time.time()
-    # summarize the
-    for i, five_minutes_chunk in enumerate(five_min_chunks):
-        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
-        sub_chunks = text_splitter.split_text(five_minutes_chunk)
-
-        summaries = []
-        for j, chunk in enumerate(sub_chunks, 1):
-            summaries.append(chain.run(chunk))
-            print(f"{timestamp}: Chunk {j}/{len(sub_chunks)}")
-
-        timestamped_summaries.append((timestamp, summaries))
-
-        elapsed_time = time.time() - start_time
-        minutes = elapsed_time // 60
-        print(f"Summarized first {5 * (i+1)} minutes of meeting, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
-
-    first_line = re.split(r"[()]", head[0])
-
-    # Transcript Notes
-    meeting_notes = f'''# {first_line[0]}
-{first_line[1]}
-## Attendees
-{head[2]}## Meeting Notes
-'''
-    for timestamp, summaries in timestamped_summaries:
-        meeting_notes += f'### {timestamp}\n'
-        for summary in summaries:
-            meeting_notes += f"- {summary.strip()}\n"
-    meeting_notes += "\nEnd of Meeting"
-
-    return meeting_notes
-
-    # with open(notes_filepath, 'w+') as f:
-    #     f.write(meeting_notes)
-
-    # print(f"Export to file {notes_filepath} completed")
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
setup.py
DELETED
@@ -1,26 +0,0 @@
-"""
-Run this script first to install requirements.txt and create config file
-"""
-import configparser
-import sys
-import subprocess
-
-# install requirements.txt
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
-
-# create default config file
-config = configparser.ConfigParser()
-
-# Required
-config['REQUIRED'] = {
-    "openai-api-key": "Replace this with your key"
-}
-
-# Optional
-config['OPTIONAL'] = {
-    'transcript-filepath': 'transcript.txt',
-    'notes-filepath': 'notes.md'
-}
-
-with open('config.ini', 'w') as configfile:
-    config.write(configfile)
summarizer.py
ADDED
@@ -0,0 +1,116 @@
+# built in
+from io import StringIO
+import re
+import time
+# 3rd party - located in requirements.txt
+import streamlit as st
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import openai
+
+HEADER_SIZE = 5  # number of lines in the transcript header
+CHUNK_SIZE = 2000  # approximate length in characters for each chunk being summarized
+TEMPERATURE = 0
+
+
+def load_transcript(input_file):
+    """Load the text from the transcript uploaded using the file uploader widget"""
+    # transform file from bytes to string
+    input_string = StringIO(input_file.getvalue().decode('UTF-8'))
+
+    # Google Meet Transcripts have a header with info like the meeting title, date, and attendees
+    # We'll want to extract this information separately, instead of having it passed to a summarizer
+
+    file_text = input_string.readlines()
+
+    header = file_text[:HEADER_SIZE]
+    transcript = "".join(file_text[HEADER_SIZE:])
+
+    return header, transcript
+
+
+def chunk_transcript(transcript: str):
+    # Google Meet transcripts show the timestamp every 5 minutes
+    # split the transcript on the 5-min timestamps
+    timestamp_regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
+    five_minute_chunks = re.split(timestamp_regex_pattern, transcript)
+
+    # create a textsplitter to subdivide those chunks into appropriately sized chunks.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
+
+    # for each 5 minute chunk divide further into sub-chunks of appropriate length
+    chunks = [text_splitter.split_text(five_minute_chunk) for five_minute_chunk in five_minute_chunks]
+
+    # chunks, is a list of lists
+    # outer list represents 5-minute sections of the meeting
+    # inner lists representing the subdivisions of that sections that are small enough to be summarized thoroughly
+
+    return chunks
+
+
+def summarize_chunks(five_minute_chunks, user_api_key, debug = False):
+    """Create summaries of each chunk of the transcript"""
+
+    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
+1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
+2, Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
+3. Rely strictly on the provided text, without including external information.
+4. Format the summary in paragraph form for easy understanding.
+5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
+'''
+
+    total_chunks = sum([len(five_minute_chunk) for five_minute_chunk in five_minute_chunks])
+    number_of_summarized_chunks = 0
+
+    progress_bar = st.progress(number_of_summarized_chunks, f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
+
+    five_minute_summaries = []
+    for sub_chunks in five_minute_chunks:
+        summaries = []
+        for chunk in sub_chunks:
+            if not debug:
+                messages = [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": chunk}
+                ]
+
+                response = openai.ChatCompletion.create(
+                    model="gpt-3.5-turbo",
+                    messages=messages,
+                    temperature=TEMPERATURE,
+                    api_key=user_api_key
+                )
+
+                summary = response['choices'][0]['message']['content']
+            else:
+                summary = "I would be a meeting note :D"
+
+            # update progress bar
+            number_of_summarized_chunks += 1
+            progress_bar.progress(number_of_summarized_chunks / total_chunks,
+                                  f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
+
+            summaries.append(summary)
+
+        five_minute_summaries.append(summaries)
+
+    return five_minute_summaries
+
+
+def format_notes(big_summaries, header):
+    """Create a string containing the meeting notes in Markdown format"""
+    # The header of Google Meet transcripts are always the same structure, so we can manually extract info from them
+    first_line = re.split(r"[()]", header[0])  # the first line contains both the title and the date
+    meeting_name = first_line[0]
+    meeting_date = first_line[1]
+    attendees = header[2]
+
+    meeting_notes = f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"
+
+    for i, summaries in enumerate(big_summaries):
+        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
+
+        meeting_notes += f"### {timestamp}\n"
+        for summary in summaries:
+            meeting_notes += f"- {summary.strip()}\n"
+
+    return meeting_notes
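format_notes leans on the fixed structure of the Google Meet transcript header: load_transcript keeps the first HEADER_SIZE (5) lines, the first of which carries the meeting title with the date in parentheses, and the third of which lists the attendees. A small illustration with a made-up header and placeholder summaries; the header contents are hypothetical, showing only the shape the code expects:

# Hypothetical header in the shape load_transcript() returns and format_notes() expects
import summarizer

header = [
    "Weekly Sync (2023-06-05)\n",   # title + "(date)" -> meeting_name, meeting_date
    "\n",
    "Alice, Bob, Carol\n",          # third line -> attendees
    "\n",
    "Transcript\n",
]

# one inner list of summaries per 5-minute section of the meeting
summaries = [["Alice reviewed last week's action items."],
             ["Bob demoed the new upload flow."]]

print(summarizer.format_notes(summaries, header))
# prints the meeting title and date, an Attendees section, and notes under 00:00:00 and 00:05:00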