NEXAS committed on
Commit
0a394f8
·
verified ·
1 Parent(s): 4a97e8c

Update utils/ingestion.py

Browse files
Files changed (1) hide show
  1. utils/ingestion.py +10 -1
utils/ingestion.py CHANGED
@@ -29,7 +29,7 @@ class DocumentProcessor:
29
  def setup_document_converter(self):
30
  """Configure document converter to support multiple formats"""
31
  pipeline_options = PdfPipelineOptions()
32
- pipeline_options.do_ocr = True
33
  pipeline_options.do_table_structure = True
34
 
35
  self.converter = DocumentConverter(
@@ -39,6 +39,8 @@ class DocumentProcessor:
39
  InputFormat.DOCX,
40
  InputFormat.HTML,
41
  InputFormat.PPTX,
 
 
42
  ],
43
  format_options={
44
  InputFormat.PDF: PdfFormatOption(
@@ -64,6 +66,13 @@ class DocumentProcessor:
64
  print(f"❌ Conversion failed: {e}")
65
  return None
66
 
 
 
 
 
 
 
 
67
  chunker = HierarchicalChunker()
68
  chunks = list(chunker.chunk(doc))
69
 
 
29
  def setup_document_converter(self):
30
  """Configure document converter to support multiple formats"""
31
  pipeline_options = PdfPipelineOptions()
32
+ pipeline_options.do_ocr = False
33
  pipeline_options.do_table_structure = True
34
 
35
  self.converter = DocumentConverter(
 
39
  InputFormat.DOCX,
40
  InputFormat.HTML,
41
  InputFormat.PPTX,
42
+ InputFormat.TXT, # Added text format
43
+ InputFormat.CSV, # Added CSV format
44
  ],
45
  format_options={
46
  InputFormat.PDF: PdfFormatOption(
 
66
  print(f"❌ Conversion failed: {e}")
67
  return None
68
 
69
+ # Save document as markdown
70
+ output_dir = Path("parsed-doc")
71
+ output_dir.mkdir(parents=True, exist_ok=True)
72
+ doc_filename = Path(file_path).stem
73
+ md_filename = output_dir / f"{doc_filename}.md"
74
+ doc.save_as_markdown(md_filename)
75
+
76
  chunker = HierarchicalChunker()
77
  chunks = list(chunker.chunk(doc))
78