feat: Add memory system with SQLite + ChromaDB hybrid storage
- memory_store.py: User-isolated observation storage with vector embeddings
- New endpoints: /memory/save, /memory/query, /memory/get, /memory/timeline
- Progressive disclosure pattern for token-efficient retrieval
- Updated Dockerfile to ROCm 7.2 nightly
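The new endpoints suggest a save-then-query flow. A minimal sketch of exercising them over HTTP (the host, HTTP methods, and every field name below are assumptions for illustration; the actual request schema lives in the service code, not in this diff):

    import requests

    BASE = "http://localhost:8000"  # hypothetical host/port

    # Store an observation for one user (user isolation per memory_store.py)
    requests.post(f"{BASE}/memory/save", json={
        "user_id": "alice",                       # assumed field name
        "text": "Prefers short, direct answers",  # assumed field name
    })

    # Vector-similarity query over that user's observations
    hits = requests.post(f"{BASE}/memory/query", json={
        "user_id": "alice",
        "query": "response style preferences",
    }).json()

    # Progressive disclosure: expand a single hit only when needed
    # (assumed id parameter and response shape)
    # full = requests.get(f"{BASE}/memory/get", params={"id": hits[0]["id"]}).json()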
168
app/document_processor.py
Normal file
@@ -0,0 +1,168 @@
"""
Document processing utilities for the RAG service.
Handles text chunking and extraction from various file formats.
"""

import logging
import subprocess
import tempfile
from pathlib import Path
from typing import List

logger = logging.getLogger("moxie-rag.processor")

# Approximate chars per token for multilingual text
CHARS_PER_TOKEN = 4


def chunk_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50,
) -> List[str]:
    """
    Split text into chunks of approximately chunk_size tokens with overlap.
    """
    char_size = chunk_size * CHARS_PER_TOKEN
    char_overlap = overlap * CHARS_PER_TOKEN

    text = text.strip()
    if not text:
        return []

    if len(text) <= char_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + char_size

        # Prefer to break at a natural boundary (paragraph, sentence, clause)
        # in the second half of the window rather than mid-word.
        if end < len(text):
            window = text[start:end]
            best_break = -1
            for separator in ["\n\n", ".\n", ". ", "?\n", "? ", "!\n", "! ", "\n", ", ", " "]:
                pos = window.rfind(separator)
                if pos > char_size // 2:
                    best_break = pos + len(separator)
                    break
            if best_break > 0:
                end = start + best_break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step forward while keeping char_overlap of trailing context; guard
        # against a zero-progress loop when the overlap spans the whole chunk.
        next_start = end - char_overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks


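# Example usage (illustrative, not from the original file): with the defaults
# above, a chunk targets ~500 tokens, i.e. 500 * CHARS_PER_TOKEN = 2000 chars,
# and consecutive chunks share ~200 chars of overlapping context.
#
#     chunks = chunk_text(long_text, chunk_size=500, overlap=50)
#     # Each chunk is <= ~2000 chars and, where possible, ends on a
#     # paragraph or sentence boundary rather than mid-word.

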
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file using pdfplumber."""
    import pdfplumber

    text_parts = []
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text)
            else:
                logger.debug(f"Page {i + 1}: no text extracted")

    result = "\n\n".join(text_parts)
    logger.info(f"Extracted {len(result)} chars from PDF ({len(text_parts)} pages with text)")
    return result


def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a DOCX file using python-docx."""
    from docx import Document

    doc = Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    result = "\n\n".join(paragraphs)
    logger.info(f"Extracted {len(result)} chars from DOCX ({len(paragraphs)} paragraphs)")
    return result


def extract_text_from_excel(file_path: str) -> str:
    """Extract text from Excel files (.xlsx, .xls) using openpyxl/pandas."""
    import pandas as pd

    text_parts = []
    xls = pd.ExcelFile(file_path)

    for sheet_name in xls.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if df.empty:
            continue

        text_parts.append(f"--- Sheet: {sheet_name} ---")

        # Include column headers
        headers = " | ".join(str(c) for c in df.columns)
        text_parts.append(f"Columns: {headers}")

        # Convert rows to readable text
        for _idx, row in df.iterrows():
            row_text = " | ".join(
                f"{col}: {val}"
                for col, val in row.items()
                if pd.notna(val) and str(val).strip()
            )
            if row_text:
                text_parts.append(row_text)

    result = "\n".join(text_parts)
    logger.info(f"Extracted {len(result)} chars from Excel ({len(xls.sheet_names)} sheets)")
    return result


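# Illustrative output shape (example data, not from the original file): a
# sheet "Q1" with columns "name" and "score" flattens to:
#
#     --- Sheet: Q1 ---
#     Columns: name | score
#     name: Ada | score: 91
#     name: Grace | score: 88

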
def extract_audio_from_video(video_path: str) -> str:
    """Extract audio track from video file using ffmpeg. Returns path to wav file."""
    # NamedTemporaryFile avoids the race condition of the deprecated
    # tempfile.mktemp; ffmpeg's -y flag overwrites the placeholder file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        subprocess.run(
            [
                "ffmpeg", "-i", video_path,
                "-vn",                       # drop the video stream
                "-acodec", "pcm_s16le",      # 16-bit PCM wav
                "-ar", "16000", "-ac", "1",  # 16 kHz mono
                "-y", audio_path,
            ],
            capture_output=True,
            check=True,
            timeout=600,
        )
        logger.info(f"Extracted audio from video to {audio_path}")
        return audio_path
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode(errors="replace")
        logger.error(f"ffmpeg failed: {stderr}")
        raise ValueError(f"Could not extract audio from video: {stderr[:200]}") from e


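# Usage note (illustrative, not from the original file): the returned wav is a
# temp file the caller owns, so remove it once transcription is done:
#
#     audio = extract_audio_from_video("talk.mp4")
#     try:
#         ...  # feed `audio` to a speech-to-text model
#     finally:
#         Path(audio).unlink(missing_ok=True)

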
def extract_text_from_file(file_path: str, filename: str) -> str:
    """
    Extract text from a file based on its extension.

    Supported: .pdf, .docx, .doc, .txt, .md, .csv, .json, .html, .xml, .rst, .xlsx, .xls
    """
    ext = Path(filename).suffix.lower()

    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext in (".docx", ".doc"):
        # NOTE: python-docx reads the zip-based .docx format; legacy binary
        # .doc files will likely fail to parse.
        return extract_text_from_docx(file_path)
    elif ext in (".xlsx", ".xls"):
        return extract_text_from_excel(file_path)
    elif ext in (".txt", ".md", ".csv", ".json", ".html", ".xml", ".rst"):
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
        logger.info(f"Read {len(content)} chars from {ext} file")
        return content
    else:
        raise ValueError(f"Unsupported file type: {ext}")
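
# End-to-end sketch (illustrative, not from the original file): extraction
# feeds directly into chunking for indexing.
#
#     text = extract_text_from_file("/tmp/upload.pdf", "report.pdf")
#     chunks = chunk_text(text)  # ~500-token chunks, ready for embedding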