import { createServerSupabaseClient } from '@/lib/supabase'
import { FileSystemItem } from '@/lib/types/database'
import crypto from 'crypto'
const OPENAI_API_KEY = process.env.OPENAI_API_KEY!
const OPENAI_URL = 'https://api.openai.com/v1'
// Alternative: Google Gemini embeddings (has free tier)
const GOOGLE_API_KEY = process.env.GOOGLE_API_KEY || process.env.NEXT_PUBLIC_GOOGLE_API_KEY
const GOOGLE_EMBEDDING_URL = 'https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent'
// Fallback: Hugging Face embeddings (free tier) - declared for a future fallback, not yet used below
const HF_API_KEY = process.env.HUGGINGFACE_API_KEY
const HF_EMBEDDING_URL = 'https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2'
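// Environment variables read above (set at least one embedding provider):
//   OPENAI_API_KEY             - preferred provider (text-embedding-3-small)
//   GOOGLE_API_KEY             - fallback provider (text-embedding-004)
//   NEXT_PUBLIC_GOOGLE_API_KEY - also accepted, but note that NEXT_PUBLIC_ variables
//                                are bundled into client-side code in Next.js
//   HUGGINGFACE_API_KEY        - only needed once the Hugging Face fallback is wired in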
export interface FileChunk {
id?: string
file_id: string
book_id: string
chunk_index: number
line_start: number
line_end: number
char_start: number
char_end: number
content: string
content_hash: string
embedding?: number[]
embedding_model: string
}
export interface SemanticSearchResult {
chunk_id: string
file_id: string
file_name: string
content: string
similarity: number
line_start: number
line_end: number
}
export class SemanticIndexingService {
private supabase: any
constructor() {
this.supabase = createServerSupabaseClient()
}
/**
* Process a file for semantic indexing
*/
async processFile(file: FileSystemItem): Promise<void> {
try {
// Mark file as processing
await this.updateFileStatus(file.id, file.book_id!, 'processing')
// Calculate content hash
const contentHash = this.calculateHash(file.content || '')
// Check if file needs re-processing
const needsProcessing = await this.needsProcessing(file.id, contentHash)
if (!needsProcessing) {
await this.updateFileStatus(file.id, file.book_id!, 'completed')
return
}
// Split file into chunks
const chunks = this.splitIntoChunks(file.content || '', file.id, file.book_id!)
// Generate embeddings for each chunk
const chunksWithEmbeddings = await this.generateEmbeddings(chunks)
// Store chunks in database
await this.storeChunks(chunksWithEmbeddings, file.id)
// Update file status
await this.updateFileStatus(file.id, file.book_id!, 'completed', contentHash)
console.log(`✅ Processed file ${file.name} with ${chunks.length} chunks`)
} catch (error) {
console.error(`❌ Failed to process file ${file.name}:`, error)
await this.updateFileStatus(file.id, file.book_id!, 'failed', undefined, error instanceof Error ? error.message : 'Unknown error')
}
}
/**
* Process all files in a book
*/
async processBook(bookId: string): Promise<void> {
try {
// Get all files in the book
const { data: files, error } = await this.supabase
.from('file_system_items')
.select('*')
.eq('book_id', bookId)
.eq('type', 'file')
.not('content', 'is', null)
if (error) throw error
console.log(`Processing ${files?.length || 0} files in book ${bookId}`)
// Process files in batches to avoid overwhelming the API
const batchSize = 3
for (let i = 0; i < (files?.length || 0); i += batchSize) {
const batch = files?.slice(i, i + batchSize) || []
await Promise.all(batch.map((file: FileSystemItem) => this.processFile(file)))
// Small delay between batches to be respectful to the API
if (i + batchSize < (files?.length || 0)) {
await new Promise(resolve => setTimeout(resolve, 1000))
}
}
console.log(`✅ Completed processing book ${bookId}`)
} catch (error) {
console.error(`❌ Failed to process book ${bookId}:`, error)
throw error
}
}
/**
* Perform semantic search on a book
*/
async semanticSearch(
bookId: string,
query: string,
options: {
maxResults?: number
similarityThreshold?: number
} = {}
): Promise<SemanticSearchResult[]> {
try {
const { maxResults = 20, similarityThreshold = 0.01 } = options
// Generate embedding for the query
const queryEmbedding = await this.generateSingleEmbedding(query)
// Search using the database function
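// The `semantic_search` Postgres function is assumed to exist in the database
// (for example, a pgvector cosine-similarity query over file_chunks filtered by
// book_id, taking search_book_id, query_embedding, similarity_threshold and
// max_results). Its exact definition lives in the database migrations, not here.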
const { data: results, error } = await this.supabase
.rpc('semantic_search', {
search_book_id: bookId,
query_embedding: queryEmbedding,
similarity_threshold: similarityThreshold,
max_results: maxResults
})
if (error) {
console.error(`Database function error:`, error)
throw error
}
return results || []
} catch (error) {
console.error('Semantic search failed:', error)
return []
}
}
/**
* Get files that need processing
*/
async getFilesNeedingProcessing(bookId: string): Promise<FileSystemItem[]> {
const { data: files, error } = await this.supabase
.from('file_system_items')
.select(`
*,
embedding_status:file_embedding_status(status, last_content_hash)
`)
.eq('book_id', bookId)
.eq('type', 'file')
.not('content', 'is', null)
if (error) throw error
return files?.filter((file: any) => {
const status = file.embedding_status?.[0]
const currentHash = this.calculateHash(file.content || '')
return !status ||
status.status === 'pending' ||
status.status === 'failed' ||
status.status === 'stale' ||
status.last_content_hash !== currentHash
}) || []
}
/**
* Split file content into chunks
*/
private splitIntoChunks(content: string, fileId: string, bookId: string): FileChunk[] {
const chunks: FileChunk[] = []
const lines = content.split('\n')
// Configuration for chunking
const CHUNK_SIZE = 1000 // characters
const CHUNK_OVERLAP = 200 // characters overlap between chunks
const MIN_CHUNK_SIZE = 100 // minimum chunk size
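// Example: a ~2,500-character file yields roughly three chunks of about 1,000
// characters, with consecutive chunks sharing ~200 characters so that text
// spanning a chunk boundary still appears intact in at least one chunk.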
let currentChunk = ''
let currentStartLine = 0
let currentCharStart = 0
let chunkIndex = 0
for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
const line = lines[lineIndex]
const lineWithNewline = line + (lineIndex < lines.length - 1 ? '\n' : '')
// Check if adding this line would exceed chunk size
if (currentChunk.length + lineWithNewline.length > CHUNK_SIZE && currentChunk.length > MIN_CHUNK_SIZE) {
// Create chunk
const chunk: FileChunk = {
file_id: fileId,
book_id: bookId,
chunk_index: chunkIndex,
line_start: currentStartLine + 1, // 1-based line numbers
line_end: lineIndex, // last line included in this chunk (1-based); the current line opens the next chunk
char_start: currentCharStart,
char_end: currentCharStart + currentChunk.length,
content: currentChunk.trim(),
content_hash: this.calculateHash(currentChunk.trim()),
embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
}
chunks.push(chunk)
chunkIndex++
// Start new chunk seeded with the tail of the previous chunk (overlap).
// char_start tracks where that overlap begins, while line_start points at the
// first new line, so the two are approximate across the overlapped region.
const overlapStart = Math.max(0, currentChunk.length - CHUNK_OVERLAP)
const overlapText = currentChunk.substring(overlapStart)
currentChunk = overlapText + lineWithNewline
currentStartLine = lineIndex
currentCharStart = currentCharStart + overlapStart
} else {
currentChunk += lineWithNewline
}
}
// Add final chunk if there's remaining content
if (currentChunk.trim().length > MIN_CHUNK_SIZE) {
const chunk: FileChunk = {
file_id: fileId,
book_id: bookId,
chunk_index: chunkIndex,
line_start: currentStartLine + 1,
line_end: lines.length,
char_start: currentCharStart,
char_end: currentCharStart + currentChunk.length,
content: currentChunk.trim(),
content_hash: this.calculateHash(currentChunk.trim()),
embedding_model: OPENAI_API_KEY ? 'text-embedding-3-small' : 'text-embedding-004'
}
chunks.push(chunk)
}
return chunks
}
/**
* Generate embeddings for chunks
*/
private async generateEmbeddings(chunks: FileChunk[]): Promise<FileChunk[]> {
const batchSize = 10 // Process in batches to avoid API limits
const chunksWithEmbeddings: FileChunk[] = []
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize)
const texts = batch.map(chunk => chunk.content)
try {
const embeddings = await this.generateBatchEmbeddings(texts)
batch.forEach((chunk, index) => {
chunk.embedding = embeddings[index]
chunksWithEmbeddings.push(chunk)
})
// Rate limiting
if (i + batchSize < chunks.length) {
await new Promise(resolve => setTimeout(resolve, 100))
}
} catch (error) {
console.error(`Failed to generate embeddings for batch ${i}:`, error)
// Don't add chunks without embeddings - this will cause the file processing to fail
throw error
}
}
return chunksWithEmbeddings
}
/**
* Generate embeddings using OpenAI with fallback to Google Gemini
*/
private async generateBatchEmbeddings(texts: string[]): Promise<number[][]> {
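// Provider order: OpenAI (text-embedding-3-small, 1536 dims) first, then Google
// (text-embedding-004, 768 dims). Embeddings from the two providers are not
// comparable and have different dimensions, so all chunks of a book should come
// from the same provider; the embedding_model recorded at chunk time assumes the
// OpenAI path succeeds whenever OPENAI_API_KEY is set.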
// Try OpenAI first
if (OPENAI_API_KEY) {
try {
const response = await fetch(`${OPENAI_URL}/embeddings`, {
method: 'POST',
headers: {
'Authorization': `Bearer ${OPENAI_API_KEY}`,
'Content-Type': 'application/json'
},
body: JSON.stringify({
model: 'text-embedding-3-small',
input: texts,
encoding_format: 'float'
})
})
if (response.ok) {
const data = await response.json()
return data.data.map((item: any) => item.embedding)
} else {
console.warn(`OpenAI API error: ${response.status}, falling back to Google Gemini`)
}
} catch (error) {
console.warn('OpenAI API failed, falling back to Google Gemini:', error)
}
}
// Fallback to Google Gemini embeddings
if (GOOGLE_API_KEY) {
try {
const embeddings: number[][] = []
// Google API processes one text at a time
for (const text of texts) {
const response = await fetch(`${GOOGLE_EMBEDDING_URL}?key=${GOOGLE_API_KEY}`, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
model: 'models/text-embedding-004',
content: {
parts: [{ text }]
}
})
})
if (!response.ok) {
throw new Error(`Google API error: ${response.status} ${response.statusText}`)
}
const data = await response.json()
embeddings.push(data.embedding.values)
// Rate limiting for Google API
await new Promise(resolve => setTimeout(resolve, 100))
}
return embeddings
} catch (error) {
console.error('Google Gemini API failed:', error)
}
}
throw new Error('No embedding provider available. Please check your API keys for OpenAI or Google.')
}
/**
* Generate single embedding for search queries
*/
private async generateSingleEmbedding(text: string): Promise<number[]> {
const embeddings = await this.generateBatchEmbeddings([text])
return embeddings[0]
}
/**
* Store chunks in database
*/
private async storeChunks(chunks: FileChunk[], fileId: string): Promise<void> {
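// Assumes a file_chunks table with a pgvector `embedding` column whose dimension
// matches the provider in use (1536 for text-embedding-3-small, 768 for
// text-embedding-004); inserts will fail if the dimensions do not match.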
// Delete existing chunks for this file
const { error: deleteError } = await this.supabase
.from('file_chunks')
.delete()
.eq('file_id', fileId)
if (deleteError) {
console.error('Failed to delete existing chunks:', deleteError)
throw deleteError
}
// Filter out chunks without embeddings
const validChunks = chunks.filter(chunk => chunk.embedding && chunk.embedding.length > 0)
if (validChunks.length === 0) {
console.warn(`No valid chunks with embeddings to store for file ${fileId}`)
return
}
console.log(`Storing ${validChunks.length} chunks with embeddings for file ${fileId}`)
// Insert new chunks
const { error } = await this.supabase
.from('file_chunks')
.insert(validChunks.map(chunk => ({
file_id: chunk.file_id,
book_id: chunk.book_id,
chunk_index: chunk.chunk_index,
line_start: chunk.line_start,
line_end: chunk.line_end,
char_start: chunk.char_start,
char_end: chunk.char_end,
content: chunk.content,
content_hash: chunk.content_hash,
embedding: chunk.embedding,
embedding_model: chunk.embedding_model
})))
if (error) {
console.error('Failed to insert chunks:', error)
throw error
}
console.log(`✅ Successfully stored ${validChunks.length} chunks in database`)
}
/**
* Check if file needs processing
*/
private async needsProcessing(fileId: string, contentHash: string): Promise<boolean> {
const { data: status } = await this.supabase
.from('file_embedding_status')
.select('status, last_content_hash')
.eq('file_id', fileId)
.maybeSingle() // resolves to null data (not an error) when no status row exists yet
if (!status) return true
return status.status !== 'completed' || status.last_content_hash !== contentHash
}
/**
* Update file processing status
*/
private async updateFileStatus(
fileId: string,
bookId: string,
status: string,
contentHash?: string,
errorMessage?: string
): Promise<void> {
const updateData: any = {
status,
updated_at: new Date().toISOString()
}
if (status === 'completed') {
updateData.last_processed_at = new Date().toISOString()
}
if (contentHash) {
updateData.last_content_hash = contentHash
}
if (errorMessage) {
updateData.error_message = errorMessage
}
console.log(`Updating file status: ${fileId} -> ${status}`)
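// Upsert keyed on file_id; this assumes file_embedding_status has a unique
// constraint (or primary key) on file_id for onConflict to resolve against.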
const { error } = await this.supabase
.from('file_embedding_status')
.upsert(
{ file_id: fileId, book_id: bookId, ...updateData },
{ onConflict: 'file_id' }
)
if (error) {
console.error('Failed to update file status:', error)
throw error
} else {
console.log(`✅ Updated file status: ${fileId} -> ${status}`)
}
}
/**
* Calculate SHA-256 hash of content
*/
private calculateHash(content: string): string {
return crypto.createHash('sha256').update(content).digest('hex')
}
}
// Export singleton instance
export const semanticIndexingService = new SemanticIndexingService()
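// Example usage (illustrative sketch: the import path, ids and query below are
// placeholders/assumptions, not part of this module):
//
//   import { semanticIndexingService } from '@/lib/semantic-indexing'
//
//   // Index every file in a book, then run a semantic query against it.
//   await semanticIndexingService.processBook(bookId)
//   const results = await semanticIndexingService.semanticSearch(bookId, 'character backstory', {
//     maxResults: 5,
//     similarityThreshold: 0.2,
//   })
//   results.forEach(r =>
//     console.log(`${r.file_name}:${r.line_start}-${r.line_end} (${r.similarity.toFixed(2)})`)
//   )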