burtenshaw committed
Commit d338da0 · 1 Parent(s): 99bcf43

simplify script down

Files changed (1)
  1. scripts/create_presentation.py +107 -198
scripts/create_presentation.py CHANGED
@@ -1,12 +1,43 @@
 import os
 import re
 from huggingface_hub import InferenceClient
-from requests.exceptions import RequestException
 import time
 import argparse
 
 # Use the model ID specified in the script's default
 DEFAULT_LLM_MODEL = "CohereLabs/c4ai-command-a-03-2025"  # Model ID from the error log
+DEFAULT_PRESENTATION_PROMPT_TEMPLATE = """
+You are an expert technical writer and presentation creator. Your task is to convert the following Markdown course material into a complete Remark.js presentation file.
+
+**Input Markdown Content:**
+
+{markdown_content}
+
+**Instructions:**
+
+1. **Structure:** Create slides based on the logical sections of the input markdown. Use `## ` headings in the input as the primary indicator for new slides.
+2. **Slide Format:** Each slide should start with `# Slide Title` derived from the corresponding `## Heading`.
+3. **Content:** Include the relevant text, code blocks (preserving language identifiers like ```python), and lists from the input markdown within each slide.
+4. **Images:** Convert Markdown images `![alt](url)` into Remark.js format: `.center[![alt](url)]`. Ensure the image URL is correct and accessible.
+5. **Presenter Notes (Transcription Style):** For each slide, generate a detailed script or **transcription** of what the presenter should say to explain the slide's content. This should be flowing text suitable for reading aloud. Place this transcription after the slide content, separated by `???`.
+6. **Separators:** Separate individual slides using `\n\n---\n\n`.
+7. **Cleanup:** Do NOT include any HTML/MDX specific tags like `<CourseFloatingBanner>`, `<Tip>`, `<Question>`, `<Youtube>`, or internal links like `[[...]]`. Remove frontmatter.
+8. **Start Slide:** Begin the presentation with a title slide:
+   ```markdown
+   class: impact
+
+   # Presentation based on {input_filename}
+   ## Generated Presentation
+
+   .center[![Hugging Face Logo](https://huggingface.co/front/assets/huggingface_logo.svg)]
+
+   ???
+   Welcome everyone. This presentation, automatically generated from the course material titled '{input_filename}', will walk you through the key topics discussed in the document. Let's begin.
+   ```
+9. **Output:** Provide ONLY the complete Remark.js Markdown content, starting with the title slide and ending with the last content slide. Do not include any introductory text, explanations, or a final 'Thank You' slide.
+
+**Generate the Remark.js presentation now:**
+"""
 
 
 def parse_arguments():
@@ -22,202 +53,62 @@ def parse_arguments():
         "--output_file",
         help="Path to the output presentation file. Defaults to <input_file_name>_presentation.md",
     )
+    parser.add_argument(
+        "--prompt_template",
+        help="Custom prompt template string (use {markdown_content} and {input_filename}). Overrides PRESENTATION_PROMPT env var and default.",
+    )
     return parser.parse_args()
 
 
-def read_input_file(filepath):
-    """Reads content from the specified file."""
-    if not os.path.exists(filepath):
-        print(f"Error: Input file not found at {filepath}")
-        return None
-    print(f"Reading input file: {filepath}")
-    try:
-        with open(filepath, "r", encoding="utf-8") as f:
-            return f.read()
-    except Exception as e:
-        print(f"Error reading file {filepath}: {e}")
-        return None
-
-
 def generate_presentation_with_llm(
-    client, llm_model, full_markdown_content, input_filename
+    client, llm_model, prompt_template, full_markdown_content, input_filename
 ):
-    """Generates the entire presentation using the LLM."""
+    """Generates the entire presentation using the LLM based on the provided prompt template."""
     if not client:
         print("LLM client not available. Cannot generate presentation.")
        return None
 
-    # Limit input content length if necessary (though models like Command R+ handle large contexts)
-    # max_input_len = 100000  # Example limit
-    # if len(full_markdown_content) > max_input_len:
-    #     print(f"Warning: Input content truncated to {max_input_len} characters for LLM.")
-    #     full_markdown_content = full_markdown_content[:max_input_len]
-
-    prompt = f"""
-    You are an expert technical writer and presentation creator. Your task is to convert the following Markdown course material into a complete Remark.js presentation file.
-
-    **Input Markdown Content:**
-
-    {full_markdown_content}
-
-    **Instructions:**
-
-    1. **Structure:** Create slides based on the logical sections of the input markdown. Use `## ` headings in the input as the primary indicator for new slides.
-    2. **Slide Format:** Each slide should start with `# Slide Title` derived from the corresponding `## Heading`.
-    3. **Content:** Include the relevant text, code blocks (preserving language identifiers like ```python), and lists from the input markdown within each slide.
-    4. **Images:** Convert Markdown images `![alt](url)` into Remark.js format: `.center[![alt](url)]`. Ensure the image URL is correct and accessible.
-    5. **Presenter Notes:** For each slide, generate concise speaker notes (2-4 sentences) summarizing the key points, definitions, or context. Place these notes after the slide content, separated by `???`.
-    6. **Separators:** Separate individual slides using `\n\n---\n\n`.
-    7. **Cleanup:** Do NOT include any HTML/MDX specific tags like `<CourseFloatingBanner>`, `<Tip>`, `<Question>`, `<Youtube>`, or internal links like `[[...]]` in the final output. Remove frontmatter if present.
-    8. **Start/End:**
-        * Begin the presentation with a title slide:
-          ```markdown
-          class: impact
-
-          # Presentation based on {os.path.basename(input_filename)}
-          ## Generated Presentation
-
-          .center[![Hugging Face Logo](https://huggingface.co/front/assets/huggingface_logo.svg)]
-
-          ???
-          This presentation was automatically generated from the content of {os.path.basename(input_filename)}. It covers the key topics discussed in the material.
-          ```
-        * End the presentation with a final "Thank You" slide:
-          ```markdown
-          class: center, middle
-
-          # Thank You!
-
-          ???
-          This concludes the presentation generated from the provided material.
-          ```
-    9. **Output:** Provide ONLY the complete Remark.js Markdown content, starting with the title slide and ending with the thank you slide, with all generated slides in between. Do not include any introductory text or explanations before or after the presentation markdown.
-
-    **Generate the Remark.js presentation now:**
-    """
-    max_retries = 2
-    retry_delay = 10  # seconds, generation can take time
-    for attempt in range(max_retries):
-        try:
-            print(
-                f"Attempting LLM generation (Attempt {attempt + 1}/{max_retries})... This may take a while."
-            )
-            # Use the client's chat completion method appropriate for the provider
-            # For Cohere provider, it might be client.chat.completions.create or similar
-            # Assuming client.chat_completion works based on previous script structure
-            completion = client.chat.completions.create(
-                messages=[{"role": "user", "content": prompt}],
-                model=llm_model,
-                max_tokens=8000,  # Increase max_tokens significantly for full presentation (adjust based on model limits)
-                temperature=0.3,  # Lower temperature for more deterministic structure following
-            )
-            presentation_content = completion.choices[0].message.content.strip()
-
-            # Basic validation: Check if it looks like a remark presentation
-            if "---" in presentation_content and "???" in presentation_content:
-                # Attempt to remove potential preamble/postamble from the LLM response
-                # Find the first 'class: impact' and last 'Thank You!' slide markers
-                start_match = re.search(r"class:\s*impact", presentation_content)
-                # Find the end of the "Thank You" slide block more reliably
-                thank_you_slide_end_index = presentation_content.rfind(
-                    "\n\n???\n"
-                )  # Look for the notes separator of the last slide
-
-                if start_match and thank_you_slide_end_index != -1:
-                    start_index = start_match.start()
-                    # Find the end of the notes for the thank you slide
-                    # Search for the end of the notes block, which might just be the end of the string
-                    end_of_notes_pattern = re.compile(
-                        r"\n\n(?!(\?\?\?|---))", re.MULTILINE
-                    )  # Look for a double newline not followed by ??? or ---
-                    end_match = end_of_notes_pattern.search(
-                        presentation_content,
-                        thank_you_slide_end_index + len("\n\n???\n"),
-                    )
-
-                    if end_match:
-                        end_index = end_match.start()  # End before the double newline
-                    else:  # If no clear end found after notes, take rest of string
-                        end_index = len(presentation_content)
-
-                    presentation_content = presentation_content[
-                        start_index:end_index
-                    ].strip()
-                    print("LLM generation successful.")
-                    return presentation_content
-                elif start_match:  # Fallback if end markers are weird but start is okay
-                    presentation_content = presentation_content[
-                        start_match.start() :
-                    ].strip()
-                    print("LLM generation successful (end marker adjustment needed).")
-                    return presentation_content
-                else:
-                    print(
-                        "Warning: Generated content might not start correctly. Using full response."
-                    )
-                    return presentation_content  # Return raw if markers not found
+    # Format the prompt using the template
+    prompt = prompt_template.format(
+        markdown_content=full_markdown_content,
+        input_filename=os.path.basename(input_filename),
+    )
 
+    # Removed retry logic
+    try:
+        print(f"Attempting LLM generation...")
+        completion = client.chat.completions.create(
+            messages=[{"role": "user", "content": prompt}],
+            model=llm_model,
+            max_tokens=8000,
+            temperature=0.3,
+        )
+        presentation_content = completion.choices[0].message.content.strip()
+
+        # Basic validation and cleanup
+        if "---" in presentation_content and "???" in presentation_content:
+            start_match = re.search(r"class:\s*impact", presentation_content)
+            if start_match:
+                # Simple cleanup: start from the first slide marker
+                presentation_content = presentation_content[
+                    start_match.start() :
+                ].strip()
+                print("LLM generation successful.")
+                return presentation_content
             else:
                print(
-                    "Warning: Generated content doesn't seem to contain expected Remark.js separators (---, ???)."
+                    "Warning: Generated content might not start correctly. Using full response."
                 )
-                return presentation_content  # Return raw content for inspection
-
-        except RequestException as e:
-            print(f"API Request Error (Attempt {attempt + 1}/{max_retries}): {e}")
-            if attempt < max_retries - 1:
-                print(f"Retrying in {retry_delay} seconds...")
-                time.sleep(retry_delay)
-            else:
-                print("Max retries reached for API request.")
-                return None
-        except Exception as e:
-            print(f"Error during LLM call (Attempt {attempt + 1}/{max_retries}): {e}")
-            # Attempt to safely access response details if they exist
-            response_details = ""
-            if hasattr(e, "response"):
-                try:
-                    status = getattr(e.response, "status_code", "N/A")
-                    text = getattr(e.response, "text", "N/A")
-                    response_details = f" (Status: {status}, Body: {text[:500]}...)"  # Limit body length
-                except Exception as inner_e:
-                    response_details = (
-                        f" (Could not parse error response details: {inner_e})"
-                    )
-            print(f"LLM Call Error: {e}{response_details}")
-
-            if attempt < max_retries - 1:
-                print(f"Retrying in {retry_delay} seconds...")
-                time.sleep(retry_delay)
-            else:
-                print("Max retries reached for LLM call.")
-                return None
-
-    print("Failed to generate presentation after multiple retries.")
-    return None
-
+                return presentation_content
+        else:
+            print(
+                "Warning: Generated content missing expected separators (---, ???). Using raw response."
+            )
+            return presentation_content  # Return raw content
 
-def write_output_file(filepath, content):
-    """Writes the presentation content to the output file."""
-    if content is None:
-        print("No content to write.")
-        return
-    print(f"\nWriting presentation to: {filepath}")
-    try:
-        # Ensure directory exists
-        output_dir = os.path.dirname(filepath)
-        if (
-            output_dir
-        ):  # Ensure output_dir is not empty (happens if writing to current dir)
-            os.makedirs(output_dir, exist_ok=True)
-        with open(filepath, "w", encoding="utf-8") as f:
-            f.write(content)
-        print("Successfully generated presentation.")
     except Exception as e:
-        print(f"Error writing output file {filepath}: {e}")
-
-
-# --- Main Orchestration ---
+        print(f"Error during LLM call: {e}")
+        return None  # Failed
 
 
 def main():
@@ -229,35 +120,53 @@ def main():
         output_file_path = args.output_file
     else:
         base_name = os.path.splitext(os.path.basename(args.input_file))[0]
-        # Place output in the same directory as input by default
         output_dir = os.path.dirname(args.input_file)
-        # Handle case where input file has no directory path
         output_file_path = os.path.join(
             output_dir or ".", f"{base_name}_presentation.md"
         )
 
-    # Get config
-    hf_api_key = os.environ.get(
-        "HF_API_KEY",
-    )
+    # --- Get Config ---
+    hf_api_key = os.environ.get("HF_API_KEY")
     llm_model = os.environ.get("LLM_MODEL", DEFAULT_LLM_MODEL)
 
-    client = InferenceClient(token=hf_api_key, provider="cohere")
-
-    # Read Input
-    all_content = read_input_file(args.input_file)
+    # Determine prompt template
+    if args.prompt_template:
+        prompt_template = args.prompt_template
+        print("Using custom prompt template from arguments.")
+    else:
+        prompt_template = os.environ.get(
+            "PRESENTATION_PROMPT", DEFAULT_PRESENTATION_PROMPT_TEMPLATE
+        )
+        if prompt_template == DEFAULT_PRESENTATION_PROMPT_TEMPLATE:
+            print("Using default prompt template.")
+        else:
+            print(
+                "Using prompt template from PRESENTATION_PROMPT environment variable."
+            )
 
-    if all_content is None:
-        exit(1)  # Exit if file reading failed
+    client = InferenceClient(token=hf_api_key, provider="cohere")
+    # --- Read Input File ---
+    print(f"Reading input file: {args.input_file}")
+    with open(args.input_file, "r", encoding="utf-8") as f:
+        all_content = f.read()
 
-    # Generate Presentation using LLM
+    # --- Generate Presentation ---
    print(f"Requesting presentation generation from model '{llm_model}'...")
     final_presentation_content = generate_presentation_with_llm(
-        client, llm_model, all_content, args.input_file
+        client, llm_model, prompt_template, all_content, args.input_file
     )
 
-    # Write Output
-    write_output_file(output_file_path, final_presentation_content)
+    # --- Write Output File ---
+    if final_presentation_content:
+        print(f"\nWriting presentation to: {output_file_path}")
+        output_dir = os.path.dirname(output_file_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        with open(output_file_path, "w", encoding="utf-8") as f:
+            f.write(final_presentation_content)
+        print("Successfully generated presentation.")
+    else:
+        print("Generation failed, no output file written.")
 
     print("Script finished.")
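
For reference, a sketch of how the simplified script might be invoked from the command line. The flags, environment variables, and template placeholders come from the diff above; the positional input-file argument and the example paths are assumptions, since the input file's add_argument call lies outside the changed hunks:

export HF_API_KEY=hf_...                             # passed to InferenceClient as the token
export LLM_MODEL=CohereLabs/c4ai-command-a-03-2025   # optional; this is the default model
python scripts/create_presentation.py course/chapter1.md \
    --output_file course/chapter1_presentation.md \
    --prompt_template "$(cat custom_prompt.txt)"     # optional; must contain {markdown_content} and {input_filename}

When --prompt_template is omitted, main() falls back to the PRESENTATION_PROMPT environment variable and then to DEFAULT_PRESENTATION_PROMPT_TEMPLATE; when --output_file is omitted, the output is written as <input_file_name>_presentation.md next to the input file.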
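For orientation, instructions 2, 5, and 6 of the new prompt template imply that each generated content slide should look roughly like the following, with consecutive slides joined by a `---` separator on its own line. This is an illustrative sketch (the heading, image URL, and note text are invented), not output from an actual run:

# Slide Title Derived From A `## Heading`

Relevant text, lists, and code blocks copied from that section of the course material.

.center[![alt text](https://example.com/diagram.png)]

???
A transcription-style script for the presenter: flowing prose, read aloud while this slide is shown.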