fmegahed committed on
Commit
f6a9c5a
·
verified ·
1 Parent(s): e25d9a2

Our data extraction tool app (version 1.0)

Browse files
Files changed (1) hide show
  1. app.R +234 -48
app.R CHANGED
@@ -1,58 +1,244 @@
1
  library(shiny)
2
- library(bslib)
3
- library(dplyr)
4
- library(ggplot2)
5
 
6
- df <- readr::read_csv("penguins.csv")
7
- # Find subset of columns that are suitable for scatter plot
8
- df_num <- df |> select(where(is.numeric), -Year)
9
 
10
- ui <- page_sidebar(
11
- theme = bs_theme(bootswatch = "minty"),
12
- title = "Penguins explorer",
13
- sidebar = sidebar(
14
- varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
15
- varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
16
- checkboxGroupInput("species", "Filter by species",
17
- choices = unique(df$Species), selected = unique(df$Species)
18
- ),
19
- hr(), # Add a horizontal rule
20
- checkboxInput("by_species", "Show species", TRUE),
21
- checkboxInput("show_margins", "Show marginal plots", TRUE),
22
- checkboxInput("smooth", "Add smoother"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ),
24
- plotOutput("scatter")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
 
27
- server <- function(input, output, session) {
28
- subsetted <- reactive({
29
- req(input$species)
30
- df |> filter(Species %in% input$species)
 
 
 
 
 
 
31
  })
32
-
33
- output$scatter <- renderPlot(
34
- {
35
- p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) +
36
- theme_light() +
37
- list(
38
- theme(legend.position = "bottom"),
39
- if (input$by_species) aes(color = Species),
40
- geom_point(),
41
- if (input$smooth) geom_smooth()
42
- )
43
-
44
- if (input$show_margins) {
45
- margin_type <- if (input$by_species) "density" else "histogram"
46
- p <- p |> ggExtra::ggMarginal(
47
- type = margin_type, margins = "both",
48
- size = 8, groupColour = input$by_species, groupFill = input$by_species
49
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
-
52
- p
53
- },
54
- res = 100
55
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  }
57
 
58
- shinyApp(ui, server)
 
 
1
  library(shiny)
2
+ library(ellmer)
3
+ library(purrr)
 
4
 
5
# Number of field definitions shown when the app starts; the "Load Examples"
# button resets the field-count input back to this value.
num_example_fields = 2
 
 
6
 
7
# Define UI for the app
#
# Layout, top to bottom:
#   1. Header banner with the tool name, version, authors and contact.
#   2. Title plus a "How to Use This App" introduction panel.
#   3. Sidebar layout: inputs on the left (recall text, number of fields, the
#      server-rendered field definitions, action buttons) and results on the
#      right (extracted table, tips, model note).
#
# Input/output ids consumed by the server: input_text, num_fields, fields_ui,
# load_example, extract_btn, extracted_table.
ui = shiny::fluidPage(
  # Header banner
  shiny::fluidRow(
    shiny::column(
      12,
      shiny::div(
        style = "background-color: #f8f9fa; padding: 10px; margin-bottom: 15px; border-radius: 5px;",
        shiny::div(
          style = "display: flex; justify-content: space-between; align-items: center;",
          shiny::div(
            shiny::strong("NHTSA Recall Information Extraction Tool"),
            shiny::p("Version 1.0 - April 2025")
          ),
          shiny::div(
            shiny::p("Authors: Fadel M. Megahed, Ying-Ju (Tessa) Chen"),
            # NOTE(review): "[email protected]" looks like an e-mail obfuscation
            # placeholder rather than a real address -- confirm and restore
            # the intended contact address.
            shiny::p("Contact: [email protected]")
          )
        )
      )
    )
  ),

  shiny::titlePanel("NHTSA Recall Information Extraction"),

  # Add introduction panel
  shiny::fluidRow(
    shiny::column(
      12,
      shiny::wellPanel(
        shiny::h4("How to Use This App"),
        shiny::p("This app extracts structured data from NHTSA recall notices using AI. Follow these steps:"),
        shiny::tags$ol(
          shiny::tags$li("Paste recall text containing information you want to extract"),
          shiny::tags$li("Specify the number of fields to extract"),
          shiny::tags$li("Define each field with a label and description"),
          shiny::tags$li("Click 'Extract Data' to process")
        ),
        shiny::p("Example: For extracting recall information, create fields like 'manufacturer', 'models', and 'defect_summary' with clear descriptions."),
        shiny::p("You can process multiple recalls at once: separate each recall text with a double line break (press Enter twice).")
      )
    )
  ),

  shiny::sidebarLayout(
    shiny::sidebarPanel(
      shiny::textAreaInput(
        "input_text",
        "Enter recall text to extract from:",
        rows = 10,
        placeholder = "Paste your recall text here...\n\nSeparate multiple recalls with double line breaks (press Enter twice).\n\nExample: 'Ford Motor Company is recalling certain 2021-2022 vehicles due to faulty brakes.'"
      ),

      shiny::numericInput(
        "num_fields",
        "Number of fields to extract:",
        value = num_example_fields,
        min = 1,
        max = 10
      ),

      # Add help text
      shiny::helpText("Define each field with a clear label (e.g., 'manufacturer') and description (e.g., 'The name of the company recalling the vehicles')."),

      # Label/description inputs, one pair per requested field (built server-side)
      shiny::uiOutput("fields_ui"),

      # Example button
      shiny::actionButton("load_example", "Load Examples", class = "btn-info"),
      shiny::actionButton("extract_btn", "Extract Data", class = "btn-primary")
    ),

    shiny::mainPanel(
      shiny::h3("Extracted Recall Data"),
      shiny::p("Results will appear here after extraction"),
      shiny::tableOutput("extracted_table"),

      # Add tips section
      shiny::wellPanel(
        shiny::h4("Tips for Better Results"),
        shiny::tags$ul(
          shiny::tags$li("Use specific field descriptions to guide the AI"),
          shiny::tags$li("Start with more fields and remove unnecessary ones later"),
          shiny::tags$li("If results are inaccurate, try rephrasing your field descriptions"),
          shiny::tags$li("To process multiple recalls, separate each with a double line break"),
          shiny::tags$li("Each recall text should contain complete information for all fields")
        )
      ),

      # Add API key notice
      shiny::wellPanel(
        shiny::h4("Note:"),
        shiny::p("To ensure the timeliness of results (since this is hosted on a CPU), we utilize `gpt-4o-mini` for this demo.")
      )
    )
  )
)
100
 
101
# Return the requested number of fields as a whole number, or NULL when the
# value cannot be used (NULL, NA, not a single value, or below 1).
# A cleared numeric input box reports NA, which must not reach `if ()`.
valid_field_count = function(n) {
  if (is.null(n) || length(n) != 1 || is.na(n) || n < 1) {
    return(NULL)
  }
  as.integer(n)
}

# Split pasted text into one trimmed block per recall. Blocks are separated by
# two or more line breaks; the separating lines may contain spaces/tabs and
# may use CRLF. Blank blocks are dropped.
split_recall_blocks = function(text) {
  blocks = unlist(strsplit(text, "(\r?\n[ \t]*){2,}"))
  blocks = trimws(blocks)
  blocks[nzchar(blocks)]
}

# Turn one extraction result (a list) into a one-row data frame that always
# has exactly `field_names` as columns followed by `block_id`.
# Missing or empty fields become NA and multi-valued fields are collapsed with
# "; ", so every row has the same shape and the rows can be stacked.
result_to_row = function(result, field_names, block_id) {
  values = lapply(field_names, function(field) {
    value = unlist(result[[field]])
    if (length(value) == 0) {
      return(NA_character_)
    }
    paste(as.character(value), collapse = "; ")
  })
  names(values) = field_names
  row = as.data.frame(values, stringsAsFactors = FALSE, check.names = FALSE)
  row$block_id = block_id
  row
}

# Define server logic required to generate dynamic UI and extract data
#
# Arguments (supplied by Shiny for each session):
#   input   - reads input_text, num_fields, load_example, extract_btn and the
#             dynamically created field_label_<i> / field_desc_<i> inputs.
#   output  - receives `fields_ui` and `extracted_table`.
#   session - used to push example values back into the inputs.
server = function(input, output, session) {

  # Load example data
  shiny::observeEvent(input$load_example, {
    example_text = "Ford Motor Company (Ford) is recalling certain 2021-2022 Bronco vehicles equipped with rearview camera systems and 8-inch screen displays. The rearview camera image may still be displayed after a backing event has ended. As such, these vehicles fail to comply with the requirements of Federal Motor Vehicle Safety Standard number 111, \"Rear Visibility.\"\n\nHonda (American Honda Motor Co.) is recalling certain 2022-2025 Acura MDX Type-S, 2023-2025 Honda Pilot, and 2021-2025 Acura TLX Type-S vehicles. A software error in the fuel injection electronic control unit (FI-ECU) may cause an engine stall or a loss of power."
    shiny::updateTextAreaInput(session, "input_text", value = example_text)

    # Set up example fields
    shiny::updateNumericInput(session, "num_fields", value = num_example_fields)
  })

  # Dynamically generate UI elements for each field's label and description
  output$fields_ui = shiny::renderUI({
    n = valid_field_count(input$num_fields)
    if (is.null(n)) return(NULL)

    # Example field definitions for NHTSA recalls
    example_labels = c("manufacturer", "defect_summary", "models", "model_years", "component", "fmvss_number", "root_cause", "risk")
    example_descs = c(
      "The name of the company recalling the vehicles.",
      "Summary of the main defect.",
      "List of affected vehicle models.",
      "List of model years affected.",
      "The part or system affected by the defect.",
      "The FMVSS number mentioned, if any.",
      "The root cause of the defect.",
      "The risk or consequence posed by the defect."
    )

    fields = lapply(seq_len(n), function(i) {
      # Set default values based on examples if available
      default_label = if (i <= length(example_labels)) example_labels[i] else paste0("field", i)
      default_desc = if (i <= length(example_descs)) example_descs[i] else paste0("Description for field ", i)

      shiny::tagList(
        shiny::textInput(
          paste0("field_label_", i),
          paste("Field", i, "Label:"),
          value = default_label
        ),
        shiny::textInput(
          paste0("field_desc_", i),
          paste("Field", i, "Description:"),
          value = default_desc
        ),
        shiny::hr()
      )
    })
    do.call(shiny::tagList, fields)
  })

  # Named list of ellmer string types, one per field with a non-blank label.
  # The names double as the column names of the result table.
  field_types = shiny::reactive({
    n = valid_field_count(input$num_fields)
    if (is.null(n)) return(list())

    type_list = list()
    for (i in seq_len(n)) {
      label = input[[paste0("field_label_", i)]]
      desc = input[[paste0("field_desc_", i)]]
      if (!is.null(label) && nzchar(trimws(label))) {
        type_list[[trimws(label)]] = ellmer::type_string(desc, required = FALSE)
      }
    }
    type_list
  })

  # Build a custom type_object based on user-specified fields.
  # Returns NULL when no usable field has been defined.
  create_type_object = shiny::reactive({
    type_list = field_types()
    if (length(type_list) == 0) return(NULL)
    do.call(ellmer::type_object, type_list)
  })

  # When the extract button is clicked, perform extraction
  shiny::observeEvent(input$extract_btn, {
    shiny::req(input$input_text)

    # Show processing indicator
    shiny::showNotification("Processing extraction request...", type = "message", duration = NULL, id = "extract_notif")

    # Everything that can fail runs inside tryCatch so the user gets an error
    # notification instead of a broken session.
    tryCatch({
      custom_type_object = create_type_object()
      if (is.null(custom_type_object)) {
        stop("Define at least one field with a non-empty label.", call. = FALSE)
      }
      field_names = names(field_types())

      # Check if API key is available
      api_key = Sys.getenv("OPENAI_API_KEY")
      if (!nzchar(api_key)) {
        stop("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
      }

      # Split the text into one block per recall
      text_blocks = split_recall_blocks(input$input_text)

      # Process each text block.
      # NOTE(review): a fresh chat is created per block on the assumption that
      # extract_data() appends to the chat's turn history, which would leak
      # earlier recalls into later requests -- confirm against the installed
      # ellmer version. Newer ellmer releases also appear to rename
      # extract_data() to chat_structured(); verify before upgrading.
      rows = lapply(seq_along(text_blocks), function(i) {
        chat = ellmer::chat_openai(
          model = "gpt-4o-mini",
          api_key = api_key
        )
        result = chat$extract_data(text_blocks[[i]], type = custom_type_object)
        if (!is.list(result)) return(NULL)
        # block_id identifies the source text block of each row
        result_to_row(result, field_names, i)
      })
      rows = Filter(Negate(is.null), rows)

      if (length(rows) > 0) {
        # Combine all results into a single data frame
        combined_results = do.call(rbind, rows)

        # Render the output as a table
        output$extracted_table = shiny::renderTable({
          combined_results
        }, rownames = TRUE)
      } else {
        # Handle the case when no valid results are returned
        output$extracted_table = shiny::renderTable({
          data.frame(message = "No valid data could be extracted")
        })
      }

      # Remove notification
      shiny::removeNotification(id = "extract_notif")
      shiny::showNotification("Extraction complete!", type = "message", duration = 3)

    }, error = function(e) {
      # Handle errors
      shiny::removeNotification(id = "extract_notif")
      shiny::showNotification(paste("Error:", conditionMessage(e)), type = "error", duration = NULL)
    })
  })
}
242
 
243
# Run the Shiny app
shiny::shinyApp(ui = ui, server = server)