import { createServerSupabaseClient } from '@/lib/supabase'
import { FileSystemItem } from '@/lib/types/database'
import crypto from 'crypto'

const OPENAI_API_KEY = process.env.OPENAI_API_KEY!
const OPENAI_URL = 'https://api.openai.com/v1'

// Alternative: Google Gemini embeddings (has free tier)
const GOOGLE_API_KEY = process.env.GOOGLE_API_KEY || process.env.NEXT_PUBLIC_GOOGLE_API_KEY
const GOOGLE_EMBEDDING_URL = 'https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent'

// Fallback: Hugging Face embeddings (completely free)
const HF_API_KEY = process.env.HUGGINGFACE_API_KEY || 'hf_' // Free tier available
const HF_EMBEDDING_URL = 'https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2'

export interface FileChunk {
  id?: string
  file_id: string
  book_id: string
  chunk_index: number
  line_start: number
  line_end: number
  char_start: number
  char_end: number
  content: string
  content_hash: string
  embedding?: number[]
  embedding_model: string
}

export interface SemanticSearchResult {
  chunk_id: string
  file_id: string
  file_name: string
  content: string
  similarity: number
  line_start: number
  line_end: number
}

export class SemanticIndexingService {
  private supabase: any

  constructor() {
    this.supabase = createServerSupabaseClient()
  }

  /**
   * Process a file for semantic indexing
   */
  async processFile(file: FileSystemItem): Promise<void> {
    try {
      // Mark file as processing
      await this.updateFileStatus(file.id, file.book_id!, 'processing')

      // Calculate content hash
      const contentHash = this.calculateHash(file.content || '')

      // Check if file needs re-processing
      const needsProcessing = await this.needsProcessing(file.id, contentHash)
      if (!needsProcessing) {
        await this.updateFileStatus(file.id, file.book_id!, 'completed')
        return
      }

      // Split file into chunks
      const chunks = this.splitIntoChunks(file.content || '', file.id, file.book_id!)

      // Generate embeddings for each chunk
      const chunksWithEmbeddings = await this.generateEmbeddings(chunks)

      // Store chunks in database
      await this.storeChunks(chunksWithEmbeddings, file.id)

      // Update file status
      await this.updateFileStatus(file.id, file.book_id!, 'completed', contentHash)

      console.log(`✅ Processed file ${file.name} with ${chunks.length} chunks`)
    } catch (error) {
      console.error(`❌ Failed to process file ${file.name}:`, error)
      await this.updateFileStatus(
        file.id,
        file.book_id!,
        'failed',
        undefined,
        error instanceof Error ? error.message : 'Unknown error'
      )
    }
  }

  /**
   * Process all files in a book
   */
  async processBook(bookId: string): Promise<void> {
    try {
      // Get all files in the book
      const { data: files, error } = await this.supabase
        .from('file_system_items')
        .select('*')
        .eq('book_id', bookId)
        .eq('type', 'file')
        .not('content', 'is', null)

      if (error) throw error

      console.log(`📚 Processing ${files?.length || 0} files in book ${bookId}`)

      // Process files in batches to avoid overwhelming the API
      const batchSize = 3
      for (let i = 0; i < (files?.length || 0); i += batchSize) {
        const batch = files?.slice(i, i + batchSize) || []
        await Promise.all(batch.map((file: FileSystemItem) => this.processFile(file)))

        // Small delay between batches to be respectful to the API
        if (i + batchSize < (files?.length || 0)) {
          await new Promise(resolve => setTimeout(resolve, 1000))
        }
      }

      console.log(`✅ Completed processing book ${bookId}`)
    } catch (error) {
      console.error(`❌ Failed to process book ${bookId}:`, error)
      throw error
    }
  }

  /**
   * Perform semantic search on a book
   */
  async semanticSearch(
    bookId: string,
    query: string,
    options: {
      maxResults?: number
      similarityThreshold?: number
    } = {}
  ): Promise<SemanticSearchResult[]> {
    try {
      const { maxResults = 20, similarityThreshold = 0.01 } = options

      // Generate embedding for the query
      const queryEmbedding = await this.generateSingleEmbedding(query)

      // Search using the database function
      const { data: results, error } = await this.supabase
        .rpc('semantic_search', {
          search_book_id: bookId,
          query_embedding: queryEmbedding,
          similarity_threshold: similarityThreshold,
          max_results: maxResults
        })

      if (error) {
        console.error('Database function error:', error)
        throw error
      }

      return results || []
    } catch (error) {
      console.error('Semantic search failed:', error)
      return []
    }
  }

  /**
   * Get files that need processing
   */
  async getFilesNeedingProcessing(bookId: string): Promise<FileSystemItem[]> {
    const { data: files, error } = await this.supabase
      .from('file_system_items')
      .select(`
        *,
        embedding_status:file_embedding_status(status, last_content_hash)
      `)
      .eq('book_id', bookId)
      .eq('type', 'file')
      .not('content', 'is', null)

    if (error) throw error

    return files?.filter((file: any) => {
      const status = file.embedding_status?.[0]
      const currentHash = this.calculateHash(file.content || '')

      return !status ||
        status.status === 'pending' ||
        status.status === 'failed' ||
        status.status === 'stale' ||
        status.last_content_hash !== currentHash
    }) || []
  }

  /**
   * Split file content into chunks
   */
  private splitIntoChunks(content: string, fileId: string, bookId: string): FileChunk[] {
    const chunks: FileChunk[] = []
    const lines = content.split('\n')

    // Configuration for chunking
    const CHUNK_SIZE = 1000 // characters
    const CHUNK_OVERLAP = 200 // characters overlap between chunks
    const MIN_CHUNK_SIZE = 100 // minimum chunk size

    let currentChunk = ''
    let currentStartLine = 0
    let currentCharStart = 0
    let chunkIndex = 0

    for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
      const line = lines[lineIndex]
      const lineWithNewline = line + (lineIndex < lines.length - 1 ? '\n' : '')

      // Check if adding this line would exceed chunk size
      if (currentChunk.length + lineWithNewline.length > CHUNK_SIZE && currentChunk.length > MIN_CHUNK_SIZE) {
        // Create chunk
        const chunk: FileChunk = {
          file_id: fileId,
          book_id: bookId,
          chunk_index: chunkIndex,
          line_start: currentStartLine + 1, // 1-based line numbers
          line_end: lineIndex, // Current line (exclusive)
          char_start: currentCharStart,
          char_end: currentCharStart + currentChunk.length,
          content: currentChunk.trim(),
          content_hash: this.calculateHash(currentChunk.trim()),
          embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
        }

        chunks.push(chunk)
        chunkIndex++

        // Start new chunk with overlap
        const overlapStart = Math.max(0, currentChunk.length - CHUNK_OVERLAP)
        const overlapText = currentChunk.substring(overlapStart)
        currentChunk = overlapText + lineWithNewline
        currentStartLine = lineIndex
        currentCharStart = currentCharStart + overlapStart
      } else {
        currentChunk += lineWithNewline
      }
    }

    // Add final chunk if there's remaining content
    if (currentChunk.trim().length > MIN_CHUNK_SIZE) {
      const chunk: FileChunk = {
        file_id: fileId,
        book_id: bookId,
        chunk_index: chunkIndex,
        line_start: currentStartLine + 1,
        line_end: lines.length,
        char_start: currentCharStart,
        char_end: currentCharStart + currentChunk.length,
        content: currentChunk.trim(),
        content_hash: this.calculateHash(currentChunk.trim()),
        embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
      }
      chunks.push(chunk)
    }

    return chunks
  }

  /**
   * Generate embeddings for chunks
   */
  private async generateEmbeddings(chunks: FileChunk[]): Promise<FileChunk[]> {
    const batchSize = 10 // Process in batches to avoid API limits
    const chunksWithEmbeddings: FileChunk[] = []

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)
      const texts = batch.map(chunk => chunk.content)

      try {
        const embeddings = await this.generateBatchEmbeddings(texts)

        batch.forEach((chunk, index) => {
          chunk.embedding = embeddings[index]
          chunksWithEmbeddings.push(chunk)
        })

        // Rate limiting
        if (i + batchSize < chunks.length) {
          await new Promise(resolve => setTimeout(resolve, 100))
        }
      } catch (error) {
        console.error(`Failed to generate embeddings for batch ${i}:`, error)
        // Don't add chunks without embeddings - this will cause the file processing to fail
        throw error
      }
    }

    return chunksWithEmbeddings
  }

  /**
   * Generate embeddings using OpenAI with fallback to Google Gemini
   */
  private async generateBatchEmbeddings(texts: string[]): Promise<number[][]> {
    // Try OpenAI first
    if (OPENAI_API_KEY) {
      try {
        const response = await fetch(`${OPENAI_URL}/embeddings`, {
          method: 'POST',
          headers: {
            'Authorization': `Bearer ${OPENAI_API_KEY}`,
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({
            model: 'text-embedding-3-small',
            input: texts,
            encoding_format: 'float'
          })
        })

        if (response.ok) {
          const data = await response.json()
          return data.data.map((item: any) => item.embedding)
        } else {
          console.log(`OpenAI API error: ${response.status}, falling back to Google Gemini`)
        }
      } catch (error) {
        console.log('OpenAI API failed, falling back to Google Gemini:', error)
      }
    }

    // Fallback to Google Gemini embeddings
    if (GOOGLE_API_KEY) {
      try {
        const embeddings: number[][] = []

        // Google API processes one text at a time
        for (const text of texts) {
          const response = await fetch(`${GOOGLE_EMBEDDING_URL}?key=${GOOGLE_API_KEY}`, {
            method: 'POST',
            headers: {
              'Content-Type': 'application/json'
            },
            body: JSON.stringify({
              model: 'models/text-embedding-004',
              content: {
                parts: [{ text }]
              }
            })
          })

          if (!response.ok) {
            throw new Error(`Google API error: ${response.status} ${response.statusText}`)
          }

          const data = await response.json()
          embeddings.push(data.embedding.values)

          // Rate limiting for Google API
          await new Promise(resolve => setTimeout(resolve, 100))
        }

        return embeddings
      } catch (error) {
        console.error('Google Gemini API failed:', error)
      }
    }

    throw new Error('No embedding provider available. Please check your API keys for OpenAI or Google.')
  }

  /**
   * Generate single embedding for search queries
   */
  private async generateSingleEmbedding(text: string): Promise<number[]> {
    const embeddings = await this.generateBatchEmbeddings([text])
    return embeddings[0]
  }

  /**
   * Store chunks in database
   */
  private async storeChunks(chunks: FileChunk[], fileId: string): Promise<void> {
    // Delete existing chunks for this file
    const { error: deleteError } = await this.supabase
      .from('file_chunks')
      .delete()
      .eq('file_id', fileId)

    if (deleteError) {
      console.error('Failed to delete existing chunks:', deleteError)
      throw deleteError
    }

    // Filter out chunks without embeddings
    const validChunks = chunks.filter(chunk => chunk.embedding && chunk.embedding.length > 0)

    if (validChunks.length === 0) {
      console.warn(`No valid chunks with embeddings to store for file ${fileId}`)
      return
    }

    console.log(`Storing ${validChunks.length} chunks with embeddings for file ${fileId}`)

    // Insert new chunks
    const { error } = await this.supabase
      .from('file_chunks')
      .insert(validChunks.map(chunk => ({
        file_id: chunk.file_id,
        book_id: chunk.book_id,
        chunk_index: chunk.chunk_index,
        line_start: chunk.line_start,
        line_end: chunk.line_end,
        char_start: chunk.char_start,
        char_end: chunk.char_end,
        content: chunk.content,
        content_hash: chunk.content_hash,
        embedding: chunk.embedding,
        embedding_model: chunk.embedding_model
      })))

    if (error) {
      console.error('Failed to insert chunks:', error)
      throw error
    }

    console.log(`✅ Successfully stored ${validChunks.length} chunks in database`)
  }

  /**
   * Check if file needs processing
   */
  private async needsProcessing(fileId: string, contentHash: string): Promise<boolean> {
    const { data: status } = await this.supabase
      .from('file_embedding_status')
      .select('status, last_content_hash')
      .eq('file_id', fileId)
      .single()

    if (!status) return true

    return status.status !== 'completed' || status.last_content_hash !== contentHash
  }

  /**
   * Update file processing status
   */
  private async updateFileStatus(
    fileId: string,
    bookId: string,
    status: string,
    contentHash?: string,
    errorMessage?: string
  ): Promise<void> {
    const updateData: any = {
      status,
      updated_at: new Date().toISOString()
    }

    if (status === 'completed') {
      updateData.last_processed_at = new Date().toISOString()
    }

    if (contentHash) {
      updateData.last_content_hash = contentHash
    }

    if (errorMessage) {
      updateData.error_message = errorMessage
    }

    console.log(`Updating file status: ${fileId} -> ${status}`)

    const { error } = await this.supabase
      .from('file_embedding_status')
      .upsert(
        {
          file_id: fileId,
          book_id: bookId,
          ...updateData
        },
        { onConflict: 'file_id' }
      )

    if (error) {
      console.error('Failed to update file status:', error)
      throw error
    } else {
      console.log(`✅ Updated file status: ${fileId} -> ${status}`)
    }
  }

  /**
   * Calculate SHA-256 hash of content
   */
  private calculateHash(content: string): string {
    return crypto.createHash('sha256').update(content).digest('hex')
  }
}

// Export singleton instance
export const semanticIndexingService = new SemanticIndexingService()
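/*
 * Usage sketch: a minimal server-side example of calling the singleton to
 * index a book and then query it. The import path is an assumption (adjust it
 * to wherever this module lives in the project), and the query string and
 * option values are purely illustrative; only processBook, semanticSearch and
 * the SemanticSearchResult fields come from this module.
 *
 *   import { semanticIndexingService } from '@/lib/services/semantic-indexing'
 *
 *   // Index (or re-index) every file in a book, then search the stored chunks.
 *   await semanticIndexingService.processBook(bookId)
 *   const results = await semanticIndexingService.semanticSearch(bookId, 'storm at the harbor', {
 *     maxResults: 10,
 *     similarityThreshold: 0.2
 *   })
 *   // Each result carries file_name, line_start/line_end and a similarity score.
 */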