feat: Add memory system with SQLite + ChromaDB hybrid storage

- memory_store.py: User-isolated observation storage with vector embeddings
- New endpoints: /memory/save, /memory/query, /memory/get, /memory/timeline
- Progressive disclosure pattern for token-efficient retrieval
- Updated Dockerfile to ROCm 7.2 nightly
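
Since the commit message only names the new endpoints, here is a minimal, hypothetical smoke test against two of them. The request field names (user_id, content, query, top_k) are assumptions, not the actual schema from memory_store.py, and the base URL is simply the RAG_URL default used in the poller below.

import httpx

BASE = "http://moxie-rag:8899"

# Save an observation for a user (field names are guesses)
resp = httpx.post(f"{BASE}/memory/save", json={
    "user_id": "alice",
    "content": "Prefers PDF reports over spreadsheets",
})
print(resp.json())

# Query it back via vector search (again, an assumed schema)
resp = httpx.post(f"{BASE}/memory/query", json={
    "user_id": "alice",
    "query": "report format preferences",
    "top_k": 5,
})
print(resp.json())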

app/email_poller.py — new file, 395 lines
@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Email poller for Zeus RAG — checks zeus@zz11.net via IMAP,
downloads attachments, and ingests them into the RAG service.
Also ingests email body text.
"""

import email
import email.header
import imaplib
import json
import logging
import os
import re
import sys
import tempfile
import time
from datetime import datetime
from email.message import Message
from pathlib import Path

import httpx

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
IMAP_HOST = os.environ.get("IMAP_HOST", "mail.oe74.net")
IMAP_PORT = int(os.environ.get("IMAP_PORT", "993"))
IMAP_USER = os.environ.get("IMAP_USER", "zeus@zz11.net")
IMAP_PASS = os.environ.get("IMAP_PASS", "")
RAG_URL = os.environ.get("RAG_URL", "http://moxie-rag:8899")
RAG_COLLECTION = os.environ.get("RAG_COLLECTION", "")  # empty = default collection
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60"))  # seconds
STATE_FILE = os.environ.get("STATE_FILE", "/app/data/email_state.json")

# Whitelist of allowed senders (comma-separated email addresses)
ALLOWED_SENDERS = os.environ.get("ALLOWED_SENDERS", "")
ALLOWED_SENDERS_LIST = [s.strip().lower() for s in ALLOWED_SENDERS.split(",") if s.strip()]

SUPPORTED_EXTENSIONS = {
    ".pdf", ".docx", ".doc", ".txt", ".md", ".csv", ".json",
    ".xlsx", ".xls", ".html", ".xml",
}
MEDIA_EXTENSIONS = {
    ".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv",
    ".mp3", ".wav", ".ogg", ".m4a", ".flac", ".aac",
}

LOG_DIR = Path(os.environ.get("LOG_DIR", "/app/logs"))
LOG_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        logging.FileHandler(LOG_DIR / "email_poller.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger("zeus-email-poller")


# ---------------------------------------------------------------------------
# State management (track processed emails)
# ---------------------------------------------------------------------------
def load_state() -> dict:
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return {"processed_uids": [], "last_check": None}


def save_state(state: dict):
    Path(STATE_FILE).parent.mkdir(parents=True, exist_ok=True)
    with open(STATE_FILE, "w") as f:
        json.dump(state, f, indent=2)
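
# For reference, the state file written above ends up shaped like this
# (values illustrative; check_emails() keeps only the last 500 UIDs):
#
#   {
#     "processed_uids": ["101", "102", "107"],
#     "last_check": "2025-01-15T09:30:00.123456"
#   }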


# ---------------------------------------------------------------------------
# Email processing
# ---------------------------------------------------------------------------
def decode_header_value(value: str) -> str:
    """Decode MIME encoded header value."""
    if not value:
        return ""
    parts = email.header.decode_header(value)
    decoded = []
    for part, charset in parts:
        if isinstance(part, bytes):
            decoded.append(part.decode(charset or "utf-8", errors="replace"))
        else:
            decoded.append(part)
    return " ".join(decoded)


def get_email_body(msg: Message) -> str:
    """Extract plain text body from email message."""
    body_parts = []
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            if ctype == "text/plain":
                payload = part.get_payload(decode=True)
                if payload:
                    charset = part.get_content_charset() or "utf-8"
                    body_parts.append(payload.decode(charset, errors="replace"))
            elif ctype == "text/html" and not body_parts:
                # Fallback to HTML if no plain text
                payload = part.get_payload(decode=True)
                if payload:
                    charset = part.get_content_charset() or "utf-8"
                    body_parts.append(payload.decode(charset, errors="replace"))
    else:
        payload = msg.get_payload(decode=True)
        if payload:
            charset = msg.get_content_charset() or "utf-8"
            body_parts.append(payload.decode(charset, errors="replace"))
    return "\n".join(body_parts).strip()


def get_attachments(msg: Message) -> list:
    """Extract attachments from email message."""
    attachments = []
    for part in msg.walk():
        if part.get_content_maintype() == "multipart":
            continue
        filename = part.get_filename()
        if filename:
            filename = decode_header_value(filename)
            payload = part.get_payload(decode=True)
            if payload:
                attachments.append({"filename": filename, "data": payload})
    return attachments


def ingest_text(content: str, title: str, source: str, doc_type: str = "email"):
    """Send text to the RAG ingest endpoint."""
    try:
        payload = {
            "content": content,
            "title": title,
            "source": source,
            "doc_type": doc_type,
            "date": datetime.now().isoformat(),
        }
        if RAG_COLLECTION:
            payload["collection"] = RAG_COLLECTION
        resp = httpx.post(
            f"{RAG_URL}/ingest",
            json=payload,
            timeout=120.0,
        )
        if resp.status_code == 200:
            result = resp.json()
            logger.info(f"Ingested text '{title}': {result.get('chunks_created', 0)} chunks")
            return result
        else:
            logger.error(f"Ingest failed ({resp.status_code}): {resp.text}")
    except Exception as e:
        logger.error(f"Error ingesting text: {e}")
    return None


def ingest_file(filepath: str, filename: str, source: str, doc_type: str = None):
    """Send a file to the RAG ingest-file endpoint."""
    ext = Path(filename).suffix.lower()
    try:
        form_data = {
            "title": filename,
            "source": source,
            "doc_type": doc_type or ext.lstrip("."),
        }
        if RAG_COLLECTION:
            form_data["collection"] = RAG_COLLECTION
        with open(filepath, "rb") as f:
            resp = httpx.post(
                f"{RAG_URL}/ingest-file",
                files={"file": (filename, f)},
                data=form_data,
                timeout=300.0,
            )
        if resp.status_code == 200:
            result = resp.json()
            logger.info(f"Ingested file '{filename}': {result.get('chunks_created', 0)} chunks")
            return result
        else:
            logger.error(f"File ingest failed ({resp.status_code}): {resp.text}")
    except Exception as e:
        logger.error(f"Error ingesting file '{filename}': {e}")
    return None


def transcribe_and_ingest(filepath: str, filename: str, source: str):
    """Send audio/video to transcribe endpoint with auto_ingest=true."""
    try:
        form_data = {
            "auto_ingest": "true",
            "title": f"Transcription: {filename}",
            "source": source,
        }
        if RAG_COLLECTION:
            form_data["collection"] = RAG_COLLECTION
        with open(filepath, "rb") as f:
            resp = httpx.post(
                f"{RAG_URL}/transcribe",
                files={"file": (filename, f)},
                data=form_data,
                timeout=600.0,
            )
        if resp.status_code == 200:
            result = resp.json()
            logger.info(
                f"Transcribed+ingested '{filename}': "
                f"{result.get('word_count', 0)} words, "
                f"{result.get('chunks_created', 0)} chunks"
            )
            return result
        else:
            logger.error(f"Transcribe failed ({resp.status_code}): {resp.text}")
    except Exception as e:
        logger.error(f"Error transcribing '{filename}': {e}")
    return None


def process_email(uid: str, msg: Message) -> dict:
    """Process a single email: extract body and attachments, ingest everything."""
    subject = decode_header_value(msg.get("Subject", "No Subject"))
    sender = decode_header_value(msg.get("From", "Unknown"))
    date_str = msg.get("Date", datetime.now().isoformat())
    source = f"email:{sender}"

    logger.info(f"Processing email UID={uid}: '{subject}' from {sender}")

    # Check sender whitelist
    if ALLOWED_SENDERS_LIST:
        sender_email = sender.lower()
        # Extract email from "Name <email@domain.com>" format
        email_match = re.search(r'<([^>]+)>', sender_email)
        if email_match:
            sender_email = email_match.group(1)

        if sender_email not in ALLOWED_SENDERS_LIST:
            logger.warning(f"Rejecting email from {sender}: not in whitelist")
            return {
                "uid": uid,
                "subject": subject,
                "sender": sender,
                "rejected": True,
                "reason": "sender_not_allowed",
            }

    results = {"uid": uid, "subject": subject, "sender": sender, "ingested": []}

    # 1. Ingest email body
    body = get_email_body(msg)
    if body and len(body.strip()) > 20:
        title = f"Email: {subject}"
        content = f"From: {sender}\nDate: {date_str}\nSubject: {subject}\n\n{body}"
        r = ingest_text(content, title, source, doc_type="email")
        if r:
            results["ingested"].append({"type": "body", "title": title, **r})

    # 2. Process attachments
    attachments = get_attachments(msg)
    for att in attachments:
        filename = att["filename"]
        ext = Path(filename).suffix.lower()

        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(att["data"])
            tmp_path = tmp.name

        try:
            att_source = f"email-attachment:{sender}:{filename}"

            if ext in SUPPORTED_EXTENSIONS:
                r = ingest_file(tmp_path, filename, att_source)
                if r:
                    results["ingested"].append({"type": "file", "filename": filename, **r})

            elif ext in MEDIA_EXTENSIONS:
                r = transcribe_and_ingest(tmp_path, filename, att_source)
                if r:
                    results["ingested"].append({"type": "media", "filename": filename, **r})

            else:
                logger.warning(f"Skipping unsupported attachment: {filename} ({ext})")
        finally:
            os.unlink(tmp_path)

    return results


def check_emails():
    """Connect to IMAP, fetch unread emails, process them."""
    state = load_state()
    processed = set(state.get("processed_uids", []))

    logger.info(f"Connecting to {IMAP_HOST}:{IMAP_PORT} as {IMAP_USER}...")

    try:
        imap = imaplib.IMAP4_SSL(IMAP_HOST, IMAP_PORT)
        imap.login(IMAP_USER, IMAP_PASS)
        imap.select("INBOX")

        # Search for UNSEEN messages
        status, data = imap.search(None, "UNSEEN")
        if status != "OK":
            logger.error(f"IMAP search failed: {status}")
            imap.logout()  # close the connection before bailing out
            return

        message_nums = data[0].split()
        if not message_nums:
            logger.info("No new emails.")
            imap.logout()
            return

        logger.info(f"Found {len(message_nums)} unread email(s)")

        for num in message_nums:
            # Get UID; the fetch response looks like b'1 (UID 123)'
            status, uid_data = imap.fetch(num, "(UID)")
            if status != "OK":
                continue
            uid = uid_data[0].decode().split("UID ")[1].split(")")[0].strip()

            if uid in processed:
                logger.info(f"Skipping already-processed UID={uid}")
                continue

            # Fetch full message
            status, msg_data = imap.fetch(num, "(RFC822)")
            if status != "OK":
                continue

            raw_email = msg_data[0][1]
            msg = email.message_from_bytes(raw_email)

            try:
                result = process_email(uid, msg)
                processed.add(uid)
                total_ingested = len(result.get("ingested", []))
                logger.info(
                    f"Email UID={uid} processed: "
                    f"{total_ingested} item(s) ingested"
                )
            except Exception as e:
                logger.error(f"Error processing UID={uid}: {e}", exc_info=True)

        imap.logout()

    except imaplib.IMAP4.error as e:
        logger.error(f"IMAP error: {e}")
    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)

    # Save state
    state["processed_uids"] = list(processed)[-500:]  # Keep last 500
    state["last_check"] = datetime.now().isoformat()
    save_state(state)


# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
def main():
    if not IMAP_PASS:
        logger.error("IMAP_PASS not set! Cannot connect to email.")
        sys.exit(1)

    logger.info(f"Email Poller starting — checking {IMAP_USER} every {POLL_INTERVAL}s")
    logger.info(f"RAG endpoint: {RAG_URL}")
    if RAG_COLLECTION:
        logger.info(f"Target collection: {RAG_COLLECTION}")
    else:
        logger.info("Target collection: default")

    # Wait for RAG service to be ready (for-else: the else branch runs
    # only if all 30 attempts finish without hitting the break)
    for attempt in range(30):
        try:
            resp = httpx.get(f"{RAG_URL}/health", timeout=5.0)
            if resp.status_code == 200:
                logger.info("RAG service is ready!")
                break
        except Exception:
            pass
        logger.info(f"Waiting for RAG service... (attempt {attempt + 1}/30)")
        time.sleep(5)
    else:
        logger.error("RAG service not available after 150s, starting anyway")

    while True:
        try:
            check_emails()
        except Exception as e:
            logger.error(f"Poll cycle error: {e}", exc_info=True)
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()
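
For a quick local run, a minimal sketch (assuming email_poller is on the import path and the RAG service is reachable); the whitelist address is a placeholder and the password is deliberately elided:

import os

os.environ.setdefault("IMAP_HOST", "mail.oe74.net")
os.environ.setdefault("IMAP_USER", "zeus@zz11.net")
os.environ["IMAP_PASS"] = "..."  # supply the real password via the environment
os.environ.setdefault("RAG_URL", "http://moxie-rag:8899")
os.environ.setdefault("ALLOWED_SENDERS", "admin@example.com")  # placeholder

# Import after setting the environment: email_poller reads os.environ
# at import time when building its module-level config constants.
import email_poller

email_poller.check_emails()  # one poll cycle, instead of main()'s infinite loop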