feat: Add memory system with SQLite + ChromaDB hybrid storage
- memory_store.py: User-isolated observation storage with vector embeddings
- New endpoints: /memory/save, /memory/query, /memory/get, /memory/timeline
- Progressive disclosure pattern for token-efficient retrieval
- Updated Dockerfile to ROCm 7.2 nightly
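The new endpoints suggest a save-then-query flow. A minimal sketch of exercising them over HTTP (the host, HTTP methods, and every field name below are assumptions for illustration; the actual request schema lives in the service code, not in this diff):

    import requests

    BASE = "http://localhost:8000"  # hypothetical host/port

    # Store an observation for one user (user isolation per memory_store.py)
    requests.post(f"{BASE}/memory/save", json={
        "user_id": "alice",                       # assumed field name
        "text": "Prefers short, direct answers",  # assumed field name
    })

    # Vector-similarity query over that user's observations
    hits = requests.post(f"{BASE}/memory/query", json={
        "user_id": "alice",
        "query": "response style preferences",
    }).json()

    # Progressive disclosure: expand a single hit only when needed
    # (assumed id parameter and response shape)
    # full = requests.get(f"{BASE}/memory/get", params={"id": hits[0]["id"]}).json()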
168
app/document_processor.py
Normal file
@@ -0,0 +1,168 @@
"""
Document processing utilities for the RAG service.
Handles text chunking and extraction from various file formats.
"""

import logging
import subprocess
import tempfile
from pathlib import Path
from typing import List

logger = logging.getLogger("moxie-rag.processor")

# Approximate chars per token for multilingual text
CHARS_PER_TOKEN = 4


def chunk_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50,
) -> List[str]:
    """
    Split text into chunks of approximately chunk_size tokens with overlap.
    """
    char_size = chunk_size * CHARS_PER_TOKEN
    char_overlap = overlap * CHARS_PER_TOKEN

    text = text.strip()
    if not text:
        return []

    if len(text) <= char_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + char_size

        # Prefer to break at a natural boundary (paragraph, sentence, clause)
        # in the second half of the window rather than mid-word.
        if end < len(text):
            window = text[start:end]
            best_break = -1
            for separator in ["\n\n", ".\n", ". ", "?\n", "? ", "!\n", "! ", "\n", ", ", " "]:
                pos = window.rfind(separator)
                if pos > char_size // 2:
                    best_break = pos + len(separator)
                    break
            if best_break > 0:
                end = start + best_break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step forward while keeping char_overlap of trailing context; guard
        # against a zero-progress loop when the overlap spans the whole chunk.
        next_start = end - char_overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks


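# Example usage (illustrative, not from the original file): with the defaults
# above, a chunk targets ~500 tokens, i.e. 500 * CHARS_PER_TOKEN = 2000 chars,
# and consecutive chunks share ~200 chars of overlapping context.
#
#     chunks = chunk_text(long_text, chunk_size=500, overlap=50)
#     # Each chunk is <= ~2000 chars and, where possible, ends on a
#     # paragraph or sentence boundary rather than mid-word.

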
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file using pdfplumber."""
    import pdfplumber

    text_parts = []
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text)
            else:
                logger.debug(f"Page {i + 1}: no text extracted")

    result = "\n\n".join(text_parts)
    logger.info(f"Extracted {len(result)} chars from PDF ({len(text_parts)} pages with text)")
    return result


def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a DOCX file using python-docx."""
    from docx import Document

    doc = Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    result = "\n\n".join(paragraphs)
    logger.info(f"Extracted {len(result)} chars from DOCX ({len(paragraphs)} paragraphs)")
    return result


def extract_text_from_excel(file_path: str) -> str:
    """Extract text from Excel files (.xlsx, .xls) using openpyxl/pandas."""
    import pandas as pd

    text_parts = []
    xls = pd.ExcelFile(file_path)

    for sheet_name in xls.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if df.empty:
            continue

        text_parts.append(f"--- Sheet: {sheet_name} ---")

        # Include column headers
        headers = " | ".join(str(c) for c in df.columns)
        text_parts.append(f"Columns: {headers}")

        # Convert rows to readable text
        for _idx, row in df.iterrows():
            row_text = " | ".join(
                f"{col}: {val}"
                for col, val in row.items()
                if pd.notna(val) and str(val).strip()
            )
            if row_text:
                text_parts.append(row_text)

    result = "\n".join(text_parts)
    logger.info(f"Extracted {len(result)} chars from Excel ({len(xls.sheet_names)} sheets)")
    return result


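# Illustrative output shape (example data, not from the original file): a
# sheet "Q1" with columns "name" and "score" flattens to:
#
#     --- Sheet: Q1 ---
#     Columns: name | score
#     name: Ada | score: 91
#     name: Grace | score: 88

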
def extract_audio_from_video(video_path: str) -> str:
    """Extract audio track from video file using ffmpeg. Returns path to wav file."""
    # NamedTemporaryFile avoids the race condition of the deprecated
    # tempfile.mktemp; ffmpeg's -y flag overwrites the placeholder file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        subprocess.run(
            [
                "ffmpeg", "-i", video_path,
                "-vn",                       # drop the video stream
                "-acodec", "pcm_s16le",      # 16-bit PCM wav
                "-ar", "16000", "-ac", "1",  # 16 kHz mono
                "-y", audio_path,
            ],
            capture_output=True,
            check=True,
            timeout=600,
        )
        logger.info(f"Extracted audio from video to {audio_path}")
        return audio_path
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode(errors="replace")
        logger.error(f"ffmpeg failed: {stderr}")
        raise ValueError(f"Could not extract audio from video: {stderr[:200]}") from e


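# Usage note (illustrative, not from the original file): the returned wav is a
# temp file the caller owns, so remove it once transcription is done:
#
#     audio = extract_audio_from_video("talk.mp4")
#     try:
#         ...  # feed `audio` to a speech-to-text model
#     finally:
#         Path(audio).unlink(missing_ok=True)

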
def extract_text_from_file(file_path: str, filename: str) -> str:
    """
    Extract text from a file based on its extension.

    Supported: .pdf, .docx, .doc, .txt, .md, .csv, .json, .html, .xml, .rst, .xlsx, .xls
    """
    ext = Path(filename).suffix.lower()

    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext in (".docx", ".doc"):
        # NOTE: python-docx reads the zip-based .docx format; legacy binary
        # .doc files will likely fail to parse.
        return extract_text_from_docx(file_path)
    elif ext in (".xlsx", ".xls"):
        return extract_text_from_excel(file_path)
    elif ext in (".txt", ".md", ".csv", ".json", ".html", ".xml", ".rst"):
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
        logger.info(f"Read {len(content)} chars from {ext} file")
        return content
    else:
        raise ValueError(f"Unsupported file type: {ext}")
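
# End-to-end sketch (illustrative, not from the original file): extraction
# feeds directly into chunking for indexing.
#
#     text = extract_text_from_file("/tmp/upload.pdf", "report.pdf")
#     chunks = chunk_text(text)  # ~500-token chunks, ready for embedding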