-- Semantic Indexing and Vector Search for BookWiz
-- Implements Cursor-style semantic code search using embeddings
-- Table for storing file chunks with their embeddings
CREATE TABLE file_chunks (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
file_id UUID REFERENCES file_system_items(id) ON DELETE CASCADE NOT NULL,
book_id UUID REFERENCES books(id) ON DELETE CASCADE NOT NULL,
-- Chunk metadata
chunk_index INTEGER NOT NULL, -- 0-based index within the file
line_start INTEGER NOT NULL,
line_end INTEGER NOT NULL,
char_start INTEGER NOT NULL,
char_end INTEGER NOT NULL,
-- Content and hash for caching
content TEXT NOT NULL,
content_hash TEXT NOT NULL, -- SHA-256 hash for change detection
-- Embedding data
embedding vector(1536), -- OpenAI text-embedding-3-small dimension
embedding_model TEXT NOT NULL DEFAULT 'text-embedding-3-small',
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
-- Unique constraint to prevent duplicate chunks
UNIQUE(file_id, chunk_index)
);
-- Table for tracking file processing status
CREATE TABLE file_embedding_status (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
file_id UUID REFERENCES file_system_items(id) ON DELETE CASCADE NOT NULL UNIQUE,
book_id UUID REFERENCES books(id) ON DELETE CASCADE NOT NULL,
-- Processing status
status TEXT NOT NULL CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'stale')) DEFAULT 'pending',
last_processed_at TIMESTAMP WITH TIME ZONE,
last_content_hash TEXT, -- Hash of entire file content when last processed
-- Error tracking
error_message TEXT,
retry_count INTEGER DEFAULT 0,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
-- Indexes for performance
CREATE INDEX idx_file_chunks_file_id ON file_chunks(file_id);
CREATE INDEX idx_file_chunks_book_id ON file_chunks(book_id);
CREATE INDEX idx_file_chunks_content_hash ON file_chunks(content_hash);
CREATE INDEX idx_file_chunks_lines ON file_chunks(file_id, line_start, line_end);
-- Vector similarity search index (requires pgvector extension)
-- Note: This may need to be added manually if pgvector isn't available
-- CREATE INDEX idx_file_chunks_embedding ON file_chunks USING ivfflat (embedding vector_cosine_ops);
CREATE INDEX idx_file_embedding_status_book_id ON file_embedding_status(book_id);
CREATE INDEX idx_file_embedding_status_status ON file_embedding_status(status);
CREATE INDEX idx_file_embedding_status_file_id ON file_embedding_status(file_id);
-- Triggers for updated_at
CREATE TRIGGER update_file_chunks_updated_at
BEFORE UPDATE ON file_chunks
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
CREATE TRIGGER update_file_embedding_status_updated_at
BEFORE UPDATE ON file_embedding_status
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- Function to calculate content hash
CREATE OR REPLACE FUNCTION calculate_content_hash(content TEXT)
RETURNS TEXT AS $$
BEGIN
RETURN encode(digest(content, 'sha256'), 'hex');
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Function to mark file for re-embedding when content changes
CREATE OR REPLACE FUNCTION mark_file_for_reembedding()
RETURNS TRIGGER AS $$
BEGIN
-- Only process file types (not folders)
IF NEW.type = 'file' AND NEW.content IS NOT NULL THEN
-- Calculate hash of new content
DECLARE
new_hash TEXT := calculate_content_hash(NEW.content);
BEGIN
-- Update or insert embedding status
INSERT INTO file_embedding_status (file_id, book_id, status, last_content_hash)
VALUES (NEW.id, NEW.book_id, 'pending', new_hash)
ON CONFLICT (file_id) DO UPDATE SET
status = CASE
WHEN file_embedding_status.last_content_hash = new_hash THEN file_embedding_status.status
ELSE 'stale'
END,
last_content_hash = new_hash,
updated_at = NOW();
END;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Trigger to automatically mark files for re-embedding
CREATE TRIGGER mark_file_for_reembedding_trigger
AFTER INSERT OR UPDATE OF content ON file_system_items
FOR EACH ROW
EXECUTE FUNCTION mark_file_for_reembedding();
-- Function for semantic search using vector similarity
CREATE OR REPLACE FUNCTION semantic_search(
search_book_id UUID,
query_embedding vector(1536),
similarity_threshold FLOAT DEFAULT 0.7,
max_results INTEGER DEFAULT 20
)
RETURNS TABLE(
chunk_id UUID,
file_id UUID,
file_name TEXT,
content TEXT,
similarity FLOAT,
line_start INTEGER,
line_end INTEGER
) AS $$
BEGIN
RETURN QUERY
SELECT
fc.id as chunk_id,
fc.file_id,
fsi.name as file_name,
fc.content,
(1 - (fc.embedding <=> query_embedding)) as similarity,
fc.line_start,
fc.line_end
FROM file_chunks fc
JOIN file_system_items fsi ON fc.file_id = fsi.id
WHERE fc.book_id = search_book_id
AND (1 - (fc.embedding <=> query_embedding)) > similarity_threshold
ORDER BY fc.embedding <=> query_embedding
LIMIT max_results;
END;
$$ LANGUAGE plpgsql;
-- Row Level Security
ALTER TABLE file_chunks ENABLE ROW LEVEL SECURITY;
ALTER TABLE file_embedding_status ENABLE ROW LEVEL SECURITY;
-- Policies for file_chunks
CREATE POLICY "Users can view chunks for their books" ON file_chunks
FOR SELECT USING (
EXISTS (
SELECT 1 FROM books
WHERE books.id = file_chunks.book_id
AND books.user_id = auth.uid()
)
);
CREATE POLICY "Users can manage chunks for their books" ON file_chunks
FOR ALL USING (
EXISTS (
SELECT 1 FROM books
WHERE books.id = file_chunks.book_id
AND books.user_id = auth.uid()
)
);
-- Policies for file_embedding_status
CREATE POLICY "Users can view embedding status for their books" ON file_embedding_status
FOR SELECT USING (
EXISTS (
SELECT 1 FROM books
WHERE books.id = file_embedding_status.book_id
AND books.user_id = auth.uid()
)
);
CREATE POLICY "Users can manage embedding status for their books" ON file_embedding_status
FOR ALL USING (
EXISTS (
SELECT 1 FROM books
WHERE books.id = file_embedding_status.book_id
AND books.user_id = auth.uid()
)
);
-- Add realtime subscriptions
ALTER PUBLICATION supabase_realtime ADD TABLE file_chunks;
ALTER PUBLICATION supabase_realtime ADD TABLE file_embedding_status;