// bookwiz.io: lib/services/semantic-indexing-service.ts
import { createServerSupabaseClient } from '@/lib/supabase'
import { FileSystemItem } from '@/lib/types/database'
import crypto from 'crypto'

const OPENAI_API_KEY = process.env.OPENAI_API_KEY // optional; presence is checked before use below
const OPENAI_URL = 'https://api.openai.com/v1'

// Alternative: Google Gemini embeddings (has free tier)
const GOOGLE_API_KEY = process.env.GOOGLE_API_KEY || process.env.NEXT_PUBLIC_GOOGLE_API_KEY // NEXT_PUBLIC_ fallback implies the key may also be exposed client-side
const GOOGLE_EMBEDDING_URL = 'https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent'

// Reserved: Hugging Face embeddings (free tier). Declared for a future
// fallback; not yet wired into generateBatchEmbeddings below.
const HF_API_KEY = process.env.HUGGINGFACE_API_KEY
const HF_EMBEDDING_URL = 'https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2'
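
// Expected environment variables (a sketch; at least one provider key must be
// set, and the names match the lookups above):
//
//   OPENAI_API_KEY=sk-...        preferred provider (text-embedding-3-small)
//   GOOGLE_API_KEY=...           fallback provider (text-embedding-004)
//   HUGGINGFACE_API_KEY=hf_...   reserved for a future fallback (unused below)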

export interface FileChunk {
  id?: string
  file_id: string
  book_id: string
  chunk_index: number
  line_start: number
  line_end: number
  char_start: number
  char_end: number
  content: string
  content_hash: string
  embedding?: number[]
  embedding_model: string
}
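
// Note: FileChunk is assumed to mirror the `file_chunks` table, where
// `embedding` is stored as a pgvector column sized to the embedding model's
// dimensionality (see the provider caveat near generateBatchEmbeddings below).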

export interface SemanticSearchResult {
  chunk_id: string
  file_id: string
  file_name: string
  content: string
  similarity: number
  line_start: number
  line_end: number
}

export class SemanticIndexingService {
  private supabase: ReturnType<typeof createServerSupabaseClient>

  constructor() {
    this.supabase = createServerSupabaseClient()
  }

  /**
   * Process a file for semantic indexing
   */
  async processFile(file: FileSystemItem): Promise<void> {
    try {
      // Mark file as processing
      await this.updateFileStatus(file.id, file.book_id!, 'processing')

      // Calculate content hash
      const contentHash = this.calculateHash(file.content || '')

      // Check if file needs re-processing
      const needsProcessing = await this.needsProcessing(file.id, contentHash)
      if (!needsProcessing) {
        await this.updateFileStatus(file.id, file.book_id!, 'completed')
        return
      }

      // Split file into chunks
      const chunks = this.splitIntoChunks(file.content || '', file.id, file.book_id!)

      // Generate embeddings for each chunk
      const chunksWithEmbeddings = await this.generateEmbeddings(chunks)

      // Store chunks in database
      await this.storeChunks(chunksWithEmbeddings, file.id)

      // Update file status
      await this.updateFileStatus(file.id, file.book_id!, 'completed', contentHash)

      console.log(`✅ Processed file ${file.name} with ${chunks.length} chunks`)
    } catch (error) {
      console.error(`โŒ Failed to process file ${file.name}:`, error)
      await this.updateFileStatus(file.id, file.book_id!, 'failed', undefined, error instanceof Error ? error.message : 'Unknown error')
    }
  }

  /**
   * Process all files in a book
   */
  async processBook(bookId: string): Promise<void> {
    try {
      // Get all files in the book
      const { data: files, error } = await this.supabase
        .from('file_system_items')
        .select('*')
        .eq('book_id', bookId)
        .eq('type', 'file')
        .not('content', 'is', null)

      if (error) throw error

      console.log(`📚 Processing ${files?.length || 0} files in book ${bookId}`)

      // Process files in batches to avoid overwhelming the API
      const batchSize = 3
      for (let i = 0; i < (files?.length || 0); i += batchSize) {
        const batch = files?.slice(i, i + batchSize) || []
        await Promise.all(batch.map((file: FileSystemItem) => this.processFile(file)))
        
        // Small delay between batches to be respectful to the API
        if (i + batchSize < (files?.length || 0)) {
          await new Promise(resolve => setTimeout(resolve, 1000))
        }
      }

      console.log(`✅ Completed processing book ${bookId}`)
    } catch (error) {
      console.error(`❌ Failed to process book ${bookId}:`, error)
      throw error
    }
  }

  /**
   * Perform semantic search on a book
   */
  async semanticSearch(
    bookId: string, 
    query: string, 
    options: {
      maxResults?: number
      similarityThreshold?: number
    } = {}
  ): Promise<SemanticSearchResult[]> {
    try {
      const { maxResults = 20, similarityThreshold = 0.01 } = options // permissive default; raise (e.g. 0.3) for stricter matches

      // Generate embedding for the query
      const queryEmbedding = await this.generateSingleEmbedding(query)

      // Search using the database function
      const { data: results, error } = await this.supabase
        .rpc('semantic_search', {
          search_book_id: bookId,
          query_embedding: queryEmbedding,
          similarity_threshold: similarityThreshold,
          max_results: maxResults
        })

      if (error) {
        console.error('Database function error:', error)
        throw error
      }

      return results || []
    } catch (error) {
      console.error('Semantic search failed:', error)
      return []
    }
  }
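
  /*
   * The `semantic_search` RPC above is assumed to be a Postgres function
   * built on pgvector, roughly of this shape. This is a sketch of the
   * expected contract, not the actual migration:
   *
   *   create function semantic_search(
   *     search_book_id uuid,
   *     query_embedding vector(1536),
   *     similarity_threshold float,
   *     max_results int
   *   ) returns table (
   *     chunk_id uuid, file_id uuid, file_name text,
   *     content text, similarity float, line_start int, line_end int
   *   )
   *   -- ordered by cosine similarity, e.g. 1 - (embedding <=> query_embedding)
   */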

  /**
   * Get files that need processing
   */
  async getFilesNeedingProcessing(bookId: string): Promise<FileSystemItem[]> {
    const { data: files, error } = await this.supabase
      .from('file_system_items')
      .select(`
        *,
        embedding_status:file_embedding_status(status, last_content_hash)
      `)
      .eq('book_id', bookId)
      .eq('type', 'file')
      .not('content', 'is', null)

    if (error) throw error

    return files?.filter((file: any) => {
      const status = file.embedding_status?.[0]
      const currentHash = this.calculateHash(file.content || '')
      
      return !status || 
             status.status === 'pending' || 
             status.status === 'failed' || 
             status.status === 'stale' ||
             status.last_content_hash !== currentHash
    }) || []
  }

  /**
   * Split file content into chunks
   */
  private splitIntoChunks(content: string, fileId: string, bookId: string): FileChunk[] {
    const chunks: FileChunk[] = []
    const lines = content.split('\n')
    
    // Configuration for chunking
    const CHUNK_SIZE = 1000 // characters
    const CHUNK_OVERLAP = 200 // characters overlap between chunks
    const MIN_CHUNK_SIZE = 100 // minimum chunk size
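
    // Worked example (approximate): with a 1000-char window and 200-char
    // overlap, the effective stride is 800 chars, so a 4000-char file yields
    // roughly ceil(4000 / 800) = 5 chunks, each sharing ~200 chars with its
    // predecessor. Actual counts vary because splits happen on line boundaries.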

    let currentChunk = ''
    let currentStartLine = 0
    let currentCharStart = 0
    let chunkIndex = 0

    for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
      const line = lines[lineIndex]
      const lineWithNewline = line + (lineIndex < lines.length - 1 ? '\n' : '')
      
      // Check if adding this line would exceed chunk size
      if (currentChunk.length + lineWithNewline.length > CHUNK_SIZE && currentChunk.length > MIN_CHUNK_SIZE) {
        // Create chunk
        const chunk: FileChunk = {
          file_id: fileId,
          book_id: bookId,
          chunk_index: chunkIndex,
          line_start: currentStartLine + 1, // 1-based line numbers
          line_end: lineIndex, // 1-based number of the last line in the chunk
          char_start: currentCharStart,
          char_end: currentCharStart + currentChunk.length,
          content: currentChunk.trim(),
          content_hash: this.calculateHash(currentChunk.trim()),
          // Best-effort label: generateBatchEmbeddings may still fall back to
          // Google even when an OpenAI key is present
          embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
        }
        
        chunks.push(chunk)
        chunkIndex++

        // Start the next chunk with trailing overlap from this one. Note that
        // currentStartLine points at the first new (non-overlap) line, so the
        // overlap text is attributed to the previous chunk's line range.
        const overlapStart = Math.max(0, currentChunk.length - CHUNK_OVERLAP)
        const overlapText = currentChunk.substring(overlapStart)

        currentChunk = overlapText + lineWithNewline
        currentStartLine = lineIndex
        currentCharStart = currentCharStart + overlapStart
      } else {
        currentChunk += lineWithNewline
      }
    }

    // Add the final chunk; keep a short one only when it is the file's sole content
    if (currentChunk.trim().length > MIN_CHUNK_SIZE || (chunks.length === 0 && currentChunk.trim().length > 0)) {
      const chunk: FileChunk = {
        file_id: fileId,
        book_id: bookId,
        chunk_index: chunkIndex,
        line_start: currentStartLine + 1,
        line_end: lines.length,
        char_start: currentCharStart,
        char_end: currentCharStart + currentChunk.length,
        content: currentChunk.trim(),
        content_hash: this.calculateHash(currentChunk.trim()),
        embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
      }
      
      chunks.push(chunk)
    }

    return chunks
  }

  /**
   * Generate embeddings for chunks
   */
  private async generateEmbeddings(chunks: FileChunk[]): Promise<FileChunk[]> {
    const batchSize = 10 // Process in batches to avoid API limits
    const chunksWithEmbeddings: FileChunk[] = []

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)
      const texts = batch.map(chunk => chunk.content)

      try {
        const embeddings = await this.generateBatchEmbeddings(texts)
        
        batch.forEach((chunk, index) => {
          chunk.embedding = embeddings[index]
          chunksWithEmbeddings.push(chunk)
        })

        // Rate limiting
        if (i + batchSize < chunks.length) {
          await new Promise(resolve => setTimeout(resolve, 100))
        }
      } catch (error) {
        console.error(`Failed to generate embeddings for batch ${i}:`, error)
        // Rethrow so processFile marks the file as failed instead of storing chunks without embeddings
        throw error
      }
    }

    return chunksWithEmbeddings
  }

  /**
   * Generate embeddings using OpenAI with fallback to Google Gemini
   */
  private async generateBatchEmbeddings(texts: string[]): Promise<number[][]> {
    // Try OpenAI first
    if (OPENAI_API_KEY) {
      try {
        const response = await fetch(`${OPENAI_URL}/embeddings`, {
          method: 'POST',
          headers: {
            'Authorization': `Bearer ${OPENAI_API_KEY}`,
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({
            model: 'text-embedding-3-small',
            input: texts,
            encoding_format: 'float'
          })
        })

        if (response.ok) {
          const data = await response.json()
          return data.data.map((item: any) => item.embedding)
        } else {
          console.warn(`OpenAI API error: ${response.status}, falling back to Google Gemini`)
        }
      } catch (error) {
        console.warn('OpenAI API failed, falling back to Google Gemini:', error)
      }
    }

    // Fallback to Google Gemini embeddings
    if (GOOGLE_API_KEY) {
      try {
        const embeddings: number[][] = []
        
        // Google API processes one text at a time
        for (const text of texts) {
          const response = await fetch(`${GOOGLE_EMBEDDING_URL}?key=${GOOGLE_API_KEY}`, {
            method: 'POST',
            headers: {
              'Content-Type': 'application/json'
            },
            body: JSON.stringify({
              model: 'models/text-embedding-004',
              content: {
                parts: [{ text }]
              }
            })
          })

          if (!response.ok) {
            throw new Error(`Google API error: ${response.status} ${response.statusText}`)
          }

          const data = await response.json()
          embeddings.push(data.embedding.values)
          
          // Rate limiting for Google API
          await new Promise(resolve => setTimeout(resolve, 100))
        }
        
        return embeddings
      } catch (error) {
        console.error('Google Gemini API failed:', error)
      }
    }

    throw new Error('No embedding provider available. Please check your API keys for OpenAI or Google.')
  }
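
  /*
   * Caveat: the two providers are not interchangeable at the vector level.
   * OpenAI's text-embedding-3-small returns 1536-dimensional vectors while
   * Google's text-embedding-004 returns 768-dimensional ones, and vectors
   * from different models live in unrelated spaces. If some chunks are
   * indexed with one provider and a query is embedded with the other, the
   * pgvector column size and the similarity scores will both be wrong, so a
   * deployment should effectively pin a single provider.
   */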

  /**
   * Generate single embedding for search queries
   */
  private async generateSingleEmbedding(text: string): Promise<number[]> {
    const embeddings = await this.generateBatchEmbeddings([text])
    return embeddings[0]
  }

  /**
   * Store chunks in database
   */
  private async storeChunks(chunks: FileChunk[], fileId: string): Promise<void> {
    // Delete existing chunks for this file
    const { error: deleteError } = await this.supabase
      .from('file_chunks')
      .delete()
      .eq('file_id', fileId)

    if (deleteError) {
      console.error('Failed to delete existing chunks:', deleteError)
      throw deleteError
    }

    // Filter out chunks without embeddings
    const validChunks = chunks.filter(chunk => chunk.embedding && chunk.embedding.length > 0)
    
    if (validChunks.length === 0) {
      console.warn(`No valid chunks with embeddings to store for file ${fileId}`)
      return
    }

    console.log(`Storing ${validChunks.length} chunks with embeddings for file ${fileId}`)

    // Insert new chunks
    const { error } = await this.supabase
      .from('file_chunks')
      .insert(validChunks.map(chunk => ({
        file_id: chunk.file_id,
        book_id: chunk.book_id,
        chunk_index: chunk.chunk_index,
        line_start: chunk.line_start,
        line_end: chunk.line_end,
        char_start: chunk.char_start,
        char_end: chunk.char_end,
        content: chunk.content,
        content_hash: chunk.content_hash,
        embedding: chunk.embedding,
        embedding_model: chunk.embedding_model
      })))

    if (error) {
      console.error('Failed to insert chunks:', error)
      throw error
    }

    console.log(`✅ Successfully stored ${validChunks.length} chunks in database`)
  }
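
  // Note: the delete-then-insert above is two separate requests, not a
  // transaction. If the insert fails after the delete succeeds, the file is
  // left with no chunks until it is reprocessed; processFile's failure path
  // marks the file 'failed', so it will be picked up again.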

  /**
   * Check if file needs processing
   */
  private async needsProcessing(fileId: string, contentHash: string): Promise<boolean> {
    const { data: status } = await this.supabase
      .from('file_embedding_status')
      .select('status, last_content_hash')
      .eq('file_id', fileId)
      .maybeSingle() // unlike .single(), returns null data rather than an error when no row exists

    if (!status) return true

    return status.status !== 'completed' || status.last_content_hash !== contentHash
  }

  /**
   * Update file processing status
   */
  private async updateFileStatus(
    fileId: string, 
    bookId: string,
    status: string, 
    contentHash?: string, 
    errorMessage?: string
  ): Promise<void> {
    const updateData: any = {
      status,
      updated_at: new Date().toISOString()
    }

    if (status === 'completed') {
      updateData.last_processed_at = new Date().toISOString()
    }

    if (contentHash) {
      updateData.last_content_hash = contentHash
    }

    if (errorMessage) {
      updateData.error_message = errorMessage
    }

    console.log(`Updating file status: ${fileId} -> ${status}`)

    const { error } = await this.supabase
      .from('file_embedding_status')
      .upsert(
        { file_id: fileId, book_id: bookId, ...updateData },
        { onConflict: 'file_id' }
      )

    if (error) {
      console.error('Failed to update file status:', error)
      throw error
    } else {
      console.log(`✅ Updated file status: ${fileId} -> ${status}`)
    }
  }

  /**
   * Calculate SHA-256 hash of content
   */
  private calculateHash(content: string): string {
    return crypto.createHash('sha256').update(content).digest('hex')
  }
}

// Export singleton instance
export const semanticIndexingService = new SemanticIndexingService()
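
/*
 * Example usage (a sketch; the route path and request shape are illustrative,
 * not part of this module):
 *
 *   import { semanticIndexingService } from '@/lib/services/semantic-indexing-service'
 *
 *   export async function POST(request: Request) {
 *     const { bookId, query } = await request.json()
 *
 *     // Index (or re-index) every file in the book, then search it.
 *     await semanticIndexingService.processBook(bookId)
 *     const results = await semanticIndexingService.semanticSearch(bookId, query, {
 *       maxResults: 10,
 *       similarityThreshold: 0.3,
 *     })
 *
 *     return Response.json({ results })
 *   }
 */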