cgoncalves committed
Commit 2f77fb3 · verified · 1 Parent(s): 62f7bff

add agents and prompts

Files changed (2)
  1. agents.py +217 -0
  2. prompts.py +26 -0
agents.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ import re
+ from datetime import datetime
+ from typing import Annotated
+
+ from dotenv import load_dotenv
+ from pydantic import BaseModel, Field
+
+ from langchain_core.messages import SystemMessage
+ from langchain_core.tools import tool
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_openai import ChatOpenAI
+
+ from langgraph.graph import END, START, MessagesState, StateGraph
+ from langgraph.graph.message import add_messages
+ from langgraph.prebuilt import ToolNode, tools_condition
+ from langgraph_supervisor.supervisor import create_supervisor
+
+ from youtube_transcript_api import (
+     NoTranscriptFound,
+     TranscriptsDisabled,
+     VideoUnavailable,
+     YouTubeTranscriptApi,
+ )
+
+ from prompts import WEB_SEARCH_PROMPT, YOUTUBE_PROMPT, MULTIMODAL_PROMPT
+
+ # Load environment variables from a .env file
+ load_dotenv()
+
+ # Initialize OpenAI LLM (gpt-4o) for general and web search tasks
+ openai_llm = ChatOpenAI(
+     model="gpt-4o",
+     use_responses_api=True,
+     api_key=os.getenv("OPENAI_API_KEY"),
+ )
+
+ # Initialize Google Gemini LLM for YouTube and multimodal tasks
+ google_llm = ChatGoogleGenerativeAI(
+     model="gemini-2.5-flash-preview-04-17",
+     google_api_key=os.getenv("GOOGLE_API_KEY"),
+ )
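+ # Note: this assumes OPENAI_API_KEY and GOOGLE_API_KEY are set in the
+ # environment or in the .env file loaded above.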
+
+ class AgentState(MessagesState):
+     """
+     State class for agent workflows; tracks the message history.
+     """
+     messages: Annotated[list, add_messages]
+
+ class YouTubeTranscriptInput(BaseModel):
+     """
+     Input schema for the YouTube transcript tool.
+     """
+     video_url: str = Field(description="YouTube URL or video ID.")
+     raw: bool = Field(default=False, description="Include timestamps?")
+
+ @tool("youtube_transcript", args_schema=YouTubeTranscriptInput)
+ def youtube_transcript(video_url: str, raw: bool = False) -> str:
+     """
+     Fetches the transcript of a YouTube video given its URL or ID.
+     Returns plain text (no timestamps) or raw text with timestamps.
+     """
+     # Extract the video ID from the URL, or use the input as-is if it is already an ID
+     if "youtube.com" in video_url or "youtu.be" in video_url:
+         match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", video_url)
+         if not match:
+             return "Invalid YouTube URL or ID."
+         video_id = match.group(1)
+     else:
+         video_id = video_url.strip()
+     try:
+         # Fetch the transcript using the API
+         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+         if raw:
+             # Return the transcript with timestamps
+             return "\n".join(f"{int(e['start'])}s: {e['text']}" for e in transcript)
+         # Return the plain transcript text
+         return " ".join(e["text"] for e in transcript)
+     except TranscriptsDisabled:
+         return "Transcripts are disabled for this video."
+     except NoTranscriptFound:
+         return "No transcript found for this video."
+     except VideoUnavailable:
+         return "This video is unavailable."
+     except Exception as e:
+         return f"An error occurred while fetching the transcript: {e}"
+
+ # List of tools available to the agent (currently only the YouTube transcript tool)
+ tools = [youtube_transcript]
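+
+ # Usage sketch (illustrative, not exercised by this commit): LangChain tools
+ # can also be invoked directly, outside the graph, e.g.
+ #   youtube_transcript.invoke({"video_url": "https://youtu.be/dQw4w9WgXcQ"})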
+
+ def create_web_search_graph() -> StateGraph:
+     """
+     Create the web search agent graph.
+
+     Returns:
+         StateGraph: The compiled web search agent workflow.
+     """
+     # Bind OpenAI's built-in web search tool (Responses API) to the LLM once;
+     # the bound runnable is reused by every invocation of the node below.
+     web_search_preview = [{"type": "web_search_preview"}]
+     llm_with_tools = openai_llm.bind_tools(web_search_preview)
+
+     def agent_node(state: AgentState) -> dict:
+         """
+         Node function for handling web search queries.
+
+         Args:
+             state (AgentState): The current agent state.
+
+         Returns:
+             dict: Updated state with the LLM response.
+         """
+         current_date = datetime.now().strftime("%B %d, %Y")
+         # Format the system prompt with the current date
+         system_message = SystemMessage(content=WEB_SEARCH_PROMPT.format(current_date=current_date))
+         response = llm_with_tools.invoke([system_message] + state.get("messages"))
+         return {"messages": state.get("messages") + [response]}
+
+     # Build the workflow graph
+     workflow = StateGraph(AgentState)
+     workflow.add_node("agent", agent_node)
+     workflow.add_edge(START, "agent")
+     workflow.add_edge("agent", END)
+     return workflow.compile(name="web_search_agent")
+
+ def create_youtube_viewer_graph() -> StateGraph:
+     """
+     Create the YouTube viewer agent graph.
+
+     Returns:
+         StateGraph: The compiled YouTube viewer agent workflow.
+     """
+     def agent_node(state: AgentState) -> dict:
+         """
+         Node function for handling YouTube-related queries.
+
+         Args:
+             state (AgentState): The current agent state.
+
+         Returns:
+             dict: Updated state with the LLM response.
+         """
+         current_date = datetime.now().strftime("%B %d, %Y")
+         # Format the system prompt with the current date
+         system_message = SystemMessage(content=YOUTUBE_PROMPT.format(current_date=current_date))
+         # Bind the YouTube transcript tool to the Gemini LLM
+         llm_with_tools = google_llm.bind_tools(tools)
+         response = llm_with_tools.invoke([system_message] + state.get("messages"))
+         return {"messages": state.get("messages") + [response]}
+
+     # Build the workflow graph with a tool node and conditional routing
+     workflow = StateGraph(AgentState)
+     workflow.add_node("llm", agent_node)
+     workflow.add_node("tools", ToolNode(tools))
+     workflow.set_entry_point("llm")
+     workflow.add_conditional_edges(
+         "llm",
+         tools_condition,
+         {
+             "tools": "tools",  # If a tool call is needed, go to the tools node
+             "__end__": END,    # Otherwise, end the workflow
+         },
+     )
+     workflow.add_edge("tools", "llm")  # After running tools, return to the LLM node
+     return workflow.compile(name="youtube_viewer_agent")
+
+ def create_multimodal_agent_graph() -> StateGraph:
+     """
+     Create the multimodal agent graph using Gemini for best multimodal support.
+
+     Returns:
+         StateGraph: The compiled multimodal agent workflow.
+     """
+     def agent_node(state: AgentState) -> dict:
+         """
+         Node function for handling multimodal queries.
+
+         Args:
+             state (AgentState): The current agent state.
+
+         Returns:
+             dict: Updated state with the LLM response.
+         """
+         current_date = datetime.now().strftime("%B %d, %Y")
+         # Compose the system message from the multimodal prompt and the current date
+         system_message = SystemMessage(content=MULTIMODAL_PROMPT + f" Today's date: {current_date}.")
+         messages = [system_message] + state.get("messages")
+         # Invoke the Gemini LLM for multimodal reasoning
+         response = google_llm.invoke(messages)
+         return {"messages": state.get("messages") + [response]}
+
+     # Build the workflow graph
+     workflow = StateGraph(AgentState)
+     workflow.add_node("agent", agent_node)
+     workflow.add_edge(START, "agent")
+     workflow.add_edge("agent", END)
+     return workflow.compile(name="multimodal_agent")
+
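+ # Note (assumption about callers): multimodal inputs are expected to arrive as
+ # message content parts (e.g. {"type": "image_url", ...}) on the incoming
+ # HumanMessage; this node simply forwards them to Gemini.
+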
+ # Instantiate the agent graphs
+ multimodal_agent = create_multimodal_agent_graph()
+ web_search_agent = create_web_search_graph()
+ youtube_agent = create_youtube_viewer_graph()
+
+ # Create the supervisor workflow to route queries to the appropriate sub-agent
+ supervisor_workflow = create_supervisor(
+     [web_search_agent, youtube_agent, multimodal_agent],
+     model=openai_llm,
+     prompt=(
+         "You are a supervisor. For each question, call one of your sub-agents "
+         "and return their answer directly to the user. Do not modify, "
+         "summarize, or rephrase the answer."
+     ),
+ )
+
+ # Compile the supervisor agent for use in the application
+ supervisor_agent = supervisor_workflow.compile(name="supervisor_agent")
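+
+ # Usage sketch (illustrative, not part of the app wiring): a compiled LangGraph
+ # graph is invoked with a messages dict, e.g.
+ #   result = supervisor_agent.invoke({"messages": [("user", "Who won the 2022 World Cup?")]})
+ #   print(result["messages"][-1].content)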
prompts.py ADDED
@@ -0,0 +1,26 @@
+ """Agent prompts."""
+
+ WEB_SEARCH_PROMPT = (
+     "You are a web search expert. Today's date: {current_date}. "
+     "Always search online for the user's question and provide a direct, concise answer. "
+     "If the question includes an image, file, or code, analyze it as part of your answer if possible. "
+     "If information is unavailable, state so clearly. "
+     "Never ask the user questions. Do not say 'I don't know.' Do not provide suggestions or follow-up questions."
+ )
+
+ YOUTUBE_PROMPT = (
+     "You are a YouTube/video expert. Today's date: {current_date}. "
+     "If a YouTube link is provided, watch the video and answer the user's question directly. "
+     "Only answer questions that require information from YouTube or online videos. "
+     "Always answer directly and concisely. Never ask the user questions. "
+     "If the information is not available in the video, state so clearly. "
+     "Do not say 'I don't know.' Do not provide suggestions or follow-up questions."
+ )
+
+ MULTIMODAL_PROMPT = (
+     "You are a multimodal expert. If the question includes an image, file, code, or audio, "
+     "analyze it and provide a direct, concise answer. "
+     "You can process and analyze images, files, code, and audio if present in the question. "
+     "If you cannot answer, state so clearly. "
+     "Never ask the user questions, never say 'I don't know', and never provide suggestions or follow-up questions."
+ )
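+
+ # Usage sketch (illustrative): the agents fill in the date placeholder at run
+ # time, e.g. WEB_SEARCH_PROMPT.format(current_date="January 1, 2025").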