moxie-rag/app/email_poller.py

#!/usr/bin/env python3
"""
Email poller for Zeus RAG — checks zeus@zz11.net via IMAP,
downloads attachments, and ingests them into the RAG service.
Also ingests email body text.
"""
import email
import email.header
import imaplib
import json
import logging
import os
import re
import sys
import tempfile
import time
from datetime import datetime
from email.message import Message
from pathlib import Path

import httpx
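
# Example invocation (illustrative values; every setting below has a default):
#   IMAP_PASS=secret IMAP_HOST=mail.example.com RAG_URL=http://moxie-rag:8899 \
#       python email_poller.py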
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
IMAP_HOST = os.environ.get("IMAP_HOST", "mail.oe74.net")
IMAP_PORT = int(os.environ.get("IMAP_PORT", "993"))
IMAP_USER = os.environ.get("IMAP_USER", "zeus@zz11.net")
IMAP_PASS = os.environ.get("IMAP_PASS", "")
RAG_URL = os.environ.get("RAG_URL", "http://moxie-rag:8899")
RAG_COLLECTION = os.environ.get("RAG_COLLECTION", "") # empty = default collection
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60")) # seconds
STATE_FILE = os.environ.get("STATE_FILE", "/app/data/email_state.json")
# Whitelist of allowed senders (comma-separated email addresses)
ALLOWED_SENDERS = os.environ.get("ALLOWED_SENDERS", "")
ALLOWED_SENDERS_LIST = [s.strip().lower() for s in ALLOWED_SENDERS.split(",") if s.strip()]
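# e.g. ALLOWED_SENDERS="alice@example.com,bob@example.com"; matching is
# case-insensitive, and an empty value disables the whitelist entirely.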
SUPPORTED_EXTENSIONS = {
".pdf", ".docx", ".doc", ".txt", ".md", ".csv", ".json",
".xlsx", ".xls", ".html", ".xml",
}
MEDIA_EXTENSIONS = {
".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv",
".mp3", ".wav", ".ogg", ".m4a", ".flac", ".aac",
}
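# Documents in SUPPORTED_EXTENSIONS go to /ingest-file; files in
# MEDIA_EXTENSIONS are routed to /transcribe for speech-to-text first.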
LOG_DIR = Path(os.environ.get("LOG_DIR", "/app/logs"))
LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
handlers=[
logging.FileHandler(LOG_DIR / "email_poller.log"),
logging.StreamHandler(sys.stdout),
],
)
logger = logging.getLogger("zeus-email-poller")
# ---------------------------------------------------------------------------
# State management (track processed emails)
# ---------------------------------------------------------------------------
def load_state() -> dict:
    """Load poller state (processed UIDs, last check time) from disk."""
    if os.path.exists(STATE_FILE):
        try:
            with open(STATE_FILE) as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            logger.warning("State file unreadable; starting with fresh state")
    return {"processed_uids": [], "last_check": None}


def save_state(state: dict):
    """Persist poller state to disk, creating the directory if needed."""
    Path(STATE_FILE).parent.mkdir(parents=True, exist_ok=True)
    with open(STATE_FILE, "w") as f:
        json.dump(state, f, indent=2)

# ---------------------------------------------------------------------------
# Email processing
# ---------------------------------------------------------------------------
def decode_header_value(value: str) -> str:
    """Decode a MIME-encoded (RFC 2047) header value."""
    if not value:
        return ""
    parts = email.header.decode_header(value)
    decoded = []
    for part, charset in parts:
        if isinstance(part, bytes):
            try:
                decoded.append(part.decode(charset or "utf-8", errors="replace"))
            except LookupError:
                # Unknown charset label (e.g. "unknown-8bit"): fall back to UTF-8.
                decoded.append(part.decode("utf-8", errors="replace"))
        else:
            decoded.append(part)
    return " ".join(decoded)
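# Example: decode_header_value("=?UTF-8?Q?Caf=C3=A9?=") returns "Café".
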
def get_email_body(msg: Message) -> str:
    """Extract the text body from an email, preferring plain text over HTML."""
    plain_parts = []
    html_parts = []
    if msg.is_multipart():
        for part in msg.walk():
            # Skip attachments; they are handled separately by get_attachments().
            if part.get_filename():
                continue
            ctype = part.get_content_type()
            if ctype not in ("text/plain", "text/html"):
                continue
            payload = part.get_payload(decode=True)
            if not payload:
                continue
            charset = part.get_content_charset() or "utf-8"
            text = payload.decode(charset, errors="replace")
            if ctype == "text/plain":
                plain_parts.append(text)
            else:
                html_parts.append(text)
    else:
        payload = msg.get_payload(decode=True)
        if payload:
            charset = msg.get_content_charset() or "utf-8"
            plain_parts.append(payload.decode(charset, errors="replace"))
    # Prefer plain text; fall back to raw HTML only when the message has no
    # text/plain part at all.
    body_parts = plain_parts or html_parts
    return "\n".join(body_parts).strip()
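# Note: Message.walk() iterates depth-first over every MIME part, including
# nested multiparts, which is why both body and attachment extraction use it.
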
def get_attachments(msg: Message) -> list:
"""Extract attachments from email message."""
attachments = []
for part in msg.walk():
if part.get_content_maintype() == "multipart":
continue
filename = part.get_filename()
if filename:
filename = decode_header_value(filename)
payload = part.get_payload(decode=True)
if payload:
attachments.append({"filename": filename, "data": payload})
    return attachments

def ingest_text(content: str, title: str, source: str, doc_type: str = "email"):
"""Send text to the RAG ingest endpoint."""
try:
payload = {
"content": content,
"title": title,
"source": source,
"doc_type": doc_type,
"date": datetime.now().isoformat(),
}
if RAG_COLLECTION:
payload["collection"] = RAG_COLLECTION
resp = httpx.post(
f"{RAG_URL}/ingest",
json=payload,
timeout=120.0,
)
if resp.status_code == 200:
result = resp.json()
logger.info(f"Ingested text '{title}': {result.get('chunks_created', 0)} chunks")
return result
else:
logger.error(f"Ingest failed ({resp.status_code}): {resp.text}")
except Exception as e:
logger.error(f"Error ingesting text: {e}")
return None
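
# The JSON body sent to /ingest looks like this (illustrative values):
#   {"content": "...", "title": "Email: hello", "source": "email:alice@example.com",
#    "doc_type": "email", "date": "2026-02-09T15:42:43", "collection": "docs"}
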
def ingest_file(filepath: str, filename: str, source: str, doc_type: str | None = None):
"""Send a file to the RAG ingest-file endpoint."""
ext = Path(filename).suffix.lower()
try:
form_data = {
"title": filename,
"source": source,
"doc_type": doc_type or ext.lstrip("."),
}
if RAG_COLLECTION:
form_data["collection"] = RAG_COLLECTION
with open(filepath, "rb") as f:
resp = httpx.post(
f"{RAG_URL}/ingest-file",
files={"file": (filename, f)},
data=form_data,
timeout=300.0,
)
if resp.status_code == 200:
result = resp.json()
logger.info(f"Ingested file '{filename}': {result.get('chunks_created', 0)} chunks")
return result
else:
logger.error(f"File ingest failed ({resp.status_code}): {resp.text}")
except Exception as e:
logger.error(f"Error ingesting file '{filename}': {e}")
return None
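
# Note: passing both `files=` and `data=` makes httpx encode the request as
# multipart/form-data, with the upload under the "file" field.
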
def transcribe_and_ingest(filepath: str, filename: str, source: str):
"""Send audio/video to transcribe endpoint with auto_ingest=true."""
try:
form_data = {
"auto_ingest": "true",
"title": f"Transcription: {filename}",
"source": source,
}
if RAG_COLLECTION:
form_data["collection"] = RAG_COLLECTION
with open(filepath, "rb") as f:
resp = httpx.post(
f"{RAG_URL}/transcribe",
files={"file": (filename, f)},
data=form_data,
timeout=600.0,
)
if resp.status_code == 200:
result = resp.json()
logger.info(
f"Transcribed+ingested '{filename}': "
f"{result.get('word_count', 0)} words, "
f"{result.get('chunks_created', 0)} chunks"
)
return result
else:
logger.error(f"Transcribe failed ({resp.status_code}): {resp.text}")
except Exception as e:
logger.error(f"Error transcribing '{filename}': {e}")
    return None

def process_email(uid: str, msg: Message) -> dict:
"""Process a single email: extract body and attachments, ingest everything."""
subject = decode_header_value(msg.get("Subject", "No Subject"))
sender = decode_header_value(msg.get("From", "Unknown"))
date_str = msg.get("Date", datetime.now().isoformat())
source = f"email:{sender}"
logger.info(f"Processing email UID={uid}: '{subject}' from {sender}")
# Check sender whitelist
if ALLOWED_SENDERS_LIST:
sender_email = sender.lower()
# Extract email from "Name <email@domain.com>" format
email_match = re.search(r'<([^>]+)>', sender_email)
if email_match:
sender_email = email_match.group(1)
if sender_email not in ALLOWED_SENDERS_LIST:
logger.warning(f"Rejecting email from {sender}: not in whitelist")
return {"uid": uid, "subject": subject, "sender": sender, "rejected": True, "reason": "sender_not_allowed"}
results = {"uid": uid, "subject": subject, "sender": sender, "ingested": []}
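    # Each entry appended to "ingested" below also carries whatever fields the
    # RAG service returned (e.g. chunks_created), merged in via **r.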
# 1. Ingest email body
body = get_email_body(msg)
if body and len(body.strip()) > 20:
title = f"Email: {subject}"
content = f"From: {sender}\nDate: {date_str}\nSubject: {subject}\n\n{body}"
r = ingest_text(content, title, source, doc_type="email")
if r:
results["ingested"].append({"type": "body", "title": title, **r})
# 2. Process attachments
attachments = get_attachments(msg)
for att in attachments:
filename = att["filename"]
ext = Path(filename).suffix.lower()
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
tmp.write(att["data"])
tmp_path = tmp.name
try:
att_source = f"email-attachment:{sender}:{filename}"
if ext in SUPPORTED_EXTENSIONS:
r = ingest_file(tmp_path, filename, att_source)
if r:
results["ingested"].append({"type": "file", "filename": filename, **r})
elif ext in MEDIA_EXTENSIONS:
r = transcribe_and_ingest(tmp_path, filename, att_source)
if r:
results["ingested"].append({"type": "media", "filename": filename, **r})
else:
logger.warning(f"Skipping unsupported attachment: {filename} ({ext})")
finally:
os.unlink(tmp_path)
    return results

def check_emails():
"""Connect to IMAP, fetch unread emails, process them."""
state = load_state()
processed = set(state.get("processed_uids", []))
logger.info(f"Connecting to {IMAP_HOST}:{IMAP_PORT} as {IMAP_USER}...")
try:
imap = imaplib.IMAP4_SSL(IMAP_HOST, IMAP_PORT)
imap.login(IMAP_USER, IMAP_PASS)
imap.select("INBOX")
# Search for UNSEEN messages
status, data = imap.search(None, "UNSEEN")
if status != "OK":
logger.error(f"IMAP search failed: {status}")
return
message_nums = data[0].split()
if not message_nums:
logger.info("No new emails.")
imap.logout()
return
logger.info(f"Found {len(message_nums)} unread email(s)")
for num in message_nums:
            # Resolve the stable UID for this sequence number; a typical
            # response item looks like b'1 (UID 4827)'.
            status, uid_data = imap.fetch(num, "(UID)")
            if status != "OK" or not uid_data:
                continue
            m = re.search(rb"UID (\d+)", uid_data[0])
            if not m:
                continue
            uid = m.group(1).decode()
if uid in processed:
logger.info(f"Skipping already-processed UID={uid}")
continue
# Fetch full message
status, msg_data = imap.fetch(num, "(RFC822)")
if status != "OK":
continue
raw_email = msg_data[0][1]
msg = email.message_from_bytes(raw_email)
try:
result = process_email(uid, msg)
processed.add(uid)
total_ingested = len(result.get("ingested", []))
logger.info(
f"Email UID={uid} processed: "
f"{total_ingested} item(s) ingested"
)
except Exception as e:
logger.error(f"Error processing UID={uid}: {e}", exc_info=True)
imap.logout()
except imaplib.IMAP4.error as e:
logger.error(f"IMAP error: {e}")
except Exception as e:
logger.error(f"Unexpected error: {e}", exc_info=True)
# Save state
state["processed_uids"] = list(processed)[-500:] # Keep last 500
state["last_check"] = datetime.now().isoformat()
save_state(state)
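
# Note: fetching RFC822 implicitly sets the \Seen flag, so the UNSEEN search
# will not return a message twice; processed_uids is a second guard in case
# flags are reset by another client.
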
# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
def main():
if not IMAP_PASS:
logger.error("IMAP_PASS not set! Cannot connect to email.")
sys.exit(1)
logger.info(f"Email Poller starting — checking {IMAP_USER} every {POLL_INTERVAL}s")
logger.info(f"RAG endpoint: {RAG_URL}")
if RAG_COLLECTION:
logger.info(f"Target collection: {RAG_COLLECTION}")
else:
logger.info("Target collection: default")
# Wait for RAG service to be ready
for attempt in range(30):
try:
resp = httpx.get(f"{RAG_URL}/health", timeout=5.0)
if resp.status_code == 200:
logger.info("RAG service is ready!")
break
except Exception:
pass
logger.info(f"Waiting for RAG service... (attempt {attempt + 1}/30)")
time.sleep(5)
else:
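        # for/else: this branch runs only if the loop above never hit `break`,
        # i.e. after 30 attempts x 5 s without a healthy response.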
logger.error("RAG service not available after 150s, starting anyway")
while True:
try:
check_emails()
except Exception as e:
logger.error(f"Poll cycle error: {e}", exc_info=True)
        time.sleep(POLL_INTERVAL)

if __name__ == "__main__":
main()