Eric Botti committed
Commit · 9b23edc
1 Parent(s): e5d260a

updated to use gpt-3.5-turbo, improved streamlit interface
Browse files
- .gitignore +1 -2
- app.py +38 -13
- main.py +0 -105
- requirements.txt +0 -0
- setup.py +0 -26
- summarizer.py +116 -0
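With this change the project runs as a plain Streamlit app (app.py plus the new summarizer module) instead of the old config-driven main.py/setup.py scripts. Assuming the usual Streamlit entry point and that requirements.txt still carries streamlit, openai, and langchain, a local run would look something like:

pip install -r requirements.txt
streamlit run app.py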
.gitignore
CHANGED
@@ -1,5 +1,4 @@
 venv
 transcript.txt
 notes.txt
-notes.md
-config.ini
+notes.md
app.py
CHANGED
@@ -1,24 +1,49 @@
-from io import StringIO
-# 3rd party
+# 3rd party - located in requirements.txt
 import streamlit as st
 # local
+import summarizer

 st.set_page_config(page_title='Transcript Notetaker', page_icon=':memo:', layout='wide')

-upload_stringio = StringIO(upload.getvalue().decode('UTF-8'))
-st.markdown(notes)
+# App Content
+'''
+# Transcript Notetaker
+
+Upload a transcript of a Google Meet call and this app will use the OpenAI API to generate detailed notes for the meeting.
+
+_This program was designed to work with the transcript documents automatically generated by Google Meet meetings, using
+transcripts with a different format may result in unexpected behavior._
+'''
+
+api_key = st.text_input("Enter your OpenAI API key", type='password')
+
+uploaded_file = st.file_uploader('Upload your Transcript', type='.txt')
+
+if api_key and uploaded_file:
+    create_notes_button_disabled = False
+    create_notes_button_help = ''
+else:
+    create_notes_button_disabled = True
+    create_notes_button_help = "Enter your API key and upload a file to continue"
+
+button_create_notes = st.button("Create Notes", disabled=create_notes_button_disabled, help=create_notes_button_help)
+
+meeting_notes = None
+
+if button_create_notes:
+    header, transcript = summarizer.load_transcript(uploaded_file)
+
+    chunks = summarizer.chunk_transcript(transcript)
+
+    summaries = summarizer.summarize_chunks(chunks, api_key)
+
+    meeting_notes = summarizer.format_notes(summaries, header)
+
+if meeting_notes:
+    st.divider()
+
+    st.download_button("Download Notes", meeting_notes, "notes.md")
+
+    st.markdown(meeting_notes)
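The new app.py drives the summarizer module (added below in summarizer.py) end to end: load the uploaded transcript, chunk it, summarize each chunk, then format the notes. A minimal standalone sketch of the same pipeline, not part of the commit: it assumes a local transcript.txt, uses io.BytesIO to stand in for Streamlit's uploaded-file object (both expose .getvalue()), and passes debug=True so no OpenAI calls are made. Note that summarize_chunks also drives a Streamlit progress bar, so it is really intended to run inside the app.

# Hypothetical sketch: exercising summarizer.py outside the Streamlit UI
from io import BytesIO

import summarizer

with open("transcript.txt", "rb") as f:   # a Google Meet transcript export (assumed filename)
    upload = BytesIO(f.read())            # mimics Streamlit's UploadedFile: has .getvalue()

header, transcript = summarizer.load_transcript(upload)
chunks = summarizer.chunk_transcript(transcript)

# debug=True returns placeholder summaries instead of calling gpt-3.5-turbo
summaries = summarizer.summarize_chunks(chunks, user_api_key="unused", debug=True)

notes = summarizer.format_notes(summaries, header)
print(notes)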
main.py
DELETED
@@ -1,105 +0,0 @@
-# standard
-import configparser
-import os
-import time
-import re
-# 3rd party
-from langchain.llms import OpenAI
-from langchain.chat_models import ChatOpenAI
-from langchain import LLMChain
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain import PromptTemplate
-
-# read config
-config = configparser.ConfigParser()
-config.read('config.ini')
-
-# read config variables
-if not os.getenv("OPENAI_API_KEY"):
-    os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
-
-# LangChain Config
-# llm
-llm = OpenAI(temperature=0)
-# prompt
-prompt = PromptTemplate(
-    template="Write a concise summary of the following: {transcript}",
-    input_variables=['transcript']
-)
-# chain
-chain = LLMChain(
-    prompt=prompt,
-    llm=llm,
-    verbose=False
-)
-
-
-def load_transcript(input_file):
-    # Google Meet Transcripts have a header which we don't want to be summarized
-    header_lines = 5
-
-    file_text = input_file.readlines()
-
-    head = file_text[:header_lines]
-    transcript = "".join(file_text[header_lines:])
-
-    return head, transcript
-
-
-def create_meeting_notes(transcript_file):
-    # read config variables
-    # if not os.getenv("OPENAI_API_KEY"):
-    #     os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
-    # transcript_filepath = config['OPTIONAL']['transcript-filepath']
-    # notes_filepath = config['OPTIONAL']['notes-filepath']
-
-    head, transcript = load_transcript(transcript_file)
-
-    # split the transcript on the 5-min timestamps
-    regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
-    five_min_chunks = re.split(regex_pattern, transcript)
-
-    # create a textsplitter to subdivide those chunks into appropriately sized chunks.
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
-
-    # list the meeting time and the chunks associated with it
-    timestamped_summaries = []
-
-    print(f"Summarizing {len(five_min_chunks)*5} minute meeting")
-    start_time = time.time()
-    # summarize the
-    for i, five_minutes_chunk in enumerate(five_min_chunks):
-        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
-        sub_chunks = text_splitter.split_text(five_minutes_chunk)
-
-        summaries = []
-        for j, chunk in enumerate(sub_chunks, 1):
-            summaries.append(chain.run(chunk))
-            print(f"{timestamp}: Chunk {j}/{len(sub_chunks)}")
-
-        timestamped_summaries.append((timestamp, summaries))
-
-        elapsed_time = time.time() - start_time
-        minutes = elapsed_time // 60
-        print(f"Summarized first {5 * (i+1)} minutes of meeting, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
-
-    first_line = re.split(r"[()]", head[0])
-
-    # Transcript Notes
-    meeting_notes = f'''# {first_line[0]}
-{first_line[1]}
-## Attendees
-{head[2]}## Meeting Notes
-'''
-    for timestamp, summaries in timestamped_summaries:
-        meeting_notes += f'### {timestamp}\n'
-        for summary in summaries:
-            meeting_notes += f"- {summary.strip()}\n"
-    meeting_notes += "\nEnd of Meeting"
-
-    return meeting_notes
-
-    # with open(notes_filepath, 'w+') as f:
-    #     f.write(meeting_notes)
-
-    # print(f"Export to file {notes_filepath} completed")
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
setup.py
DELETED
@@ -1,26 +0,0 @@
-"""
-Run this script first to install requirements.txt and create config file
-"""
-import configparser
-import sys
-import subprocess
-
-# install requirements.txt
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
-
-# create default config file
-config = configparser.ConfigParser()
-
-# Required
-config['REQUIRED'] = {
-    "openai-api-key": "Replace this with your key"
-}
-
-# Optional
-config['OPTIONAL'] = {
-    'transcript-filepath': 'transcript.txt',
-    'notes-filepath': 'notes.md'
-}
-
-with open('config.ini', 'w') as configfile:
-    config.write(configfile)
summarizer.py
ADDED
@@ -0,0 +1,116 @@
+# built in
+from io import StringIO
+import re
+import time
+# 3rd party - located in requirements.txt
+import streamlit as st
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import openai
+
+HEADER_SIZE = 5  # number of lines in the transcript header
+CHUNK_SIZE = 2000  # approximate length in characters for each chunk being summarized
+TEMPERATURE = 0
+
+
+def load_transcript(input_file):
+    """Load the text from the transcript uploaded using the file uploader widget"""
+    # transform file from bytes to string
+    input_string = StringIO(input_file.getvalue().decode('UTF-8'))
+
+    # Google Meet Transcripts have a header with info like the meeting title, date, and attendees
+    # We'll want to extract this information separately, instead of having it passed to a summarizer
+
+    file_text = input_string.readlines()
+
+    header = file_text[:HEADER_SIZE]
+    transcript = "".join(file_text[HEADER_SIZE:])
+
+    return header, transcript
+
+
+def chunk_transcript(transcript: str):
+    # Google Meet transcripts show the timestamp every 5 minutes
+    # split the transcript on the 5-min timestamps
+    timestamp_regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
+    five_minute_chunks = re.split(timestamp_regex_pattern, transcript)
+
+    # create a textsplitter to subdivide those chunks into appropriately sized chunks.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
+
+    # for each 5 minute chunk divide further into sub-chunks of appropriate length
+    chunks = [text_splitter.split_text(five_minute_chunk) for five_minute_chunk in five_minute_chunks]
+
+    # chunks, is a list of lists
+    # outer list represents 5-minute sections of the meeting
+    # inner lists representing the subdivisions of that sections that are small enough to be summarized thoroughly
+
+    return chunks
+
+
+def summarize_chunks(five_minute_chunks, user_api_key, debug = False):
+    """Create summaries of each chunk of the transcript"""
+
+    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
+1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
+2, Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
+3. Rely strictly on the provided text, without including external information.
+4. Format the summary in paragraph form for easy understanding.
+5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
+'''
+
+    total_chunks = sum([len(five_minute_chunk) for five_minute_chunk in five_minute_chunks])
+    number_of_summarized_chunks = 0
+
+    progress_bar = st.progress(number_of_summarized_chunks, f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
+
+    five_minute_summaries = []
+    for sub_chunks in five_minute_chunks:
+        summaries = []
+        for chunk in sub_chunks:
+            if not debug:
+                messages = [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": chunk}
+                ]
+
+                response = openai.ChatCompletion.create(
+                    model="gpt-3.5-turbo",
+                    messages=messages,
+                    temperature=TEMPERATURE,
+                    api_key=user_api_key
+                )
+
+                summary = response['choices'][0]['message']['content']
+            else:
+                summary = "I would be a meeting note :D"
+
+            # update progress bar
+            number_of_summarized_chunks += 1
+            progress_bar.progress(number_of_summarized_chunks / total_chunks,
+                                  f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
+
+            summaries.append(summary)
+
+        five_minute_summaries.append(summaries)
+
+    return five_minute_summaries
+
+
+def format_notes(big_summaries, header):
+    """Create a string containing the meeting notes in Markdown format"""
+    # The header of Google Meet transcripts are always the same structure, so we can manually extract info from them
+    first_line = re.split(r"[()]", header[0])  # the first line contains both the title and the date
+    meeting_name = first_line[0]
+    meeting_date = first_line[1]
+    attendees = header[2]
+
+    meeting_notes = f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"
+
+    for i, summaries in enumerate(big_summaries):
+        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
+
+        meeting_notes += f"### {timestamp}\n"
+        for summary in summaries:
+            meeting_notes += f"- {summary.strip()}\n"
+
+    return meeting_notes
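format_notes leans on the fixed structure of the Google Meet transcript header: load_transcript keeps the first HEADER_SIZE (5) lines, the first of which carries the meeting title with the date in parentheses, and the third of which lists the attendees. A small illustration with a made-up header and placeholder summaries; the header contents are hypothetical, showing only the shape the code expects:

# Hypothetical header in the shape load_transcript() returns and format_notes() expects
import summarizer

header = [
    "Weekly Sync (2023-06-05)\n",   # title + "(date)" -> meeting_name, meeting_date
    "\n",
    "Alice, Bob, Carol\n",          # third line -> attendees
    "\n",
    "Transcript\n",
]

# one inner list of summaries per 5-minute section of the meeting
summaries = [["Alice reviewed last week's action items."],
             ["Bob demoed the new upload flow."]]

print(summarizer.format_notes(summaries, header))
# prints the meeting title and date, an Attendees section, and notes under 00:00:00 and 00:05:00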