bookwiz.io / supabase / migrations / 048_fix_word_count_final.sql
048_fix_word_count_final.sql
Raw
-- Fix word count calculation issues
-- This migration addresses multiple problems with word counting

-- 1. Fix calculate_book_word_count function to use correct column names and better file filtering
CREATE OR REPLACE FUNCTION calculate_book_word_count(book_uuid UUID)
RETURNS INTEGER 
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    total_words INTEGER := 0;
    item_record RECORD;
BEGIN
    -- Sum word counts from all text files in the book
    -- Use 'type' column (not 'item_type') and include all text file types
    FOR item_record IN 
        SELECT content 
        FROM file_system_items 
        WHERE book_id = book_uuid 
        AND type = 'file'
        AND (
            -- Include common text file extensions
            file_extension IN ('txt', 'md', 'markdown', 'doc', 'docx', 'rtf', 'odt', 'html', 'htm', 'xml', 'json', 'csv', 'tex', 'rst', 'asciidoc', 'adoc')
            OR 
            -- Include files with text mime types
            mime_type LIKE 'text/%'
            OR
            -- Include files without extension but with content (fallback)
            (file_extension IS NULL AND content IS NOT NULL AND content != '')
        )
        AND content IS NOT NULL
        AND content != ''
    LOOP
        -- Improved word count: split by whitespace and count non-empty elements
        -- Handle multiple whitespace characters and newlines properly
        total_words := total_words + (
            SELECT array_length(
                string_to_array(
                    regexp_replace(item_record.content, '\s+', ' ', 'g'), 
                    ' '
                ), 
                1
            )
        );
    END LOOP;
    
    RETURN COALESCE(total_words, 0);
END;
$$ LANGUAGE plpgsql;

-- 2. Fix update_book_word_count function to use correct column names
CREATE OR REPLACE FUNCTION update_book_word_count()
RETURNS TRIGGER 
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    book_uuid UUID;
    new_word_count INTEGER;
BEGIN
    -- Get the book_id for the file that was modified
    IF TG_OP = 'DELETE' THEN
        book_uuid := OLD.book_id;
    ELSE
        book_uuid := NEW.book_id;
    END IF;
    
    -- Only process if this is a text file
    -- Use 'type' column (not 'item_type') and include all text file types
    IF (TG_OP = 'DELETE' AND OLD.type = 'file' AND (
        OLD.file_extension IN ('txt', 'md', 'markdown', 'doc', 'docx', 'rtf', 'odt', 'html', 'htm', 'xml', 'json', 'csv', 'tex', 'rst', 'asciidoc', 'adoc')
        OR OLD.mime_type LIKE 'text/%'
        OR (OLD.file_extension IS NULL AND OLD.content IS NOT NULL AND OLD.content != '')
    )) OR
       (TG_OP IN ('INSERT', 'UPDATE') AND NEW.type = 'file' AND (
        NEW.file_extension IN ('txt', 'md', 'markdown', 'doc', 'docx', 'rtf', 'odt', 'html', 'htm', 'xml', 'json', 'csv', 'tex', 'rst', 'asciidoc', 'adoc')
        OR NEW.mime_type LIKE 'text/%'
        OR (NEW.file_extension IS NULL AND NEW.content IS NOT NULL AND NEW.content != '')
    )) THEN
        
        -- Calculate new word count for the book
        new_word_count := calculate_book_word_count(book_uuid);
        
        -- Update the book's word count
        UPDATE books 
        SET word_count = new_word_count, 
            updated_at = NOW()
        WHERE id = book_uuid;
    END IF;
    
    -- Return appropriate record based on operation
    IF TG_OP = 'DELETE' THEN
        RETURN OLD;
    ELSE
        RETURN NEW;
    END IF;
END;
$$ LANGUAGE plpgsql;

-- 3. Create a function to manually recalculate word counts for all books
CREATE OR REPLACE FUNCTION recalculate_all_book_word_counts()
RETURNS TABLE(book_id UUID, book_title TEXT, old_word_count INTEGER, new_word_count INTEGER)
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    book_record RECORD;
    new_count INTEGER;
BEGIN
    -- Loop through all books and recalculate their word counts
    FOR book_record IN 
        SELECT id, title, word_count 
        FROM books 
        ORDER BY updated_at DESC
    LOOP
        -- Calculate new word count
        new_count := calculate_book_word_count(book_record.id);
        
        -- Update the book's word count
        UPDATE books 
        SET word_count = new_count, 
            updated_at = NOW()
        WHERE id = book_record.id;
        
        -- Return the change information
        book_id := book_record.id;
        book_title := book_record.title;
        old_word_count := book_record.word_count;
        new_word_count := new_count;
        
        RETURN NEXT;
    END LOOP;
    
    RETURN;
END;
$$ LANGUAGE plpgsql;

-- 4. Create a function to recalculate word count for a specific book
CREATE OR REPLACE FUNCTION recalculate_book_word_count(book_uuid UUID)
RETURNS INTEGER
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    new_word_count INTEGER;
BEGIN
    -- Calculate new word count
    new_word_count := calculate_book_word_count(book_uuid);
    
    -- Update the book's word count
    UPDATE books 
    SET word_count = new_word_count, 
        updated_at = NOW()
    WHERE id = book_uuid;
    
    RETURN new_word_count;
END;
$$ LANGUAGE plpgsql;

-- 5. Add comments for documentation
COMMENT ON FUNCTION calculate_book_word_count(UUID) IS 'Calculates word count for a book from all text files - includes various text file types and handles edge cases';
COMMENT ON FUNCTION update_book_word_count() IS 'Updates book word count when files change - triggers automatically on file operations';
COMMENT ON FUNCTION recalculate_all_book_word_counts() IS 'Manually recalculates word counts for all books - useful for fixing existing data';
COMMENT ON FUNCTION recalculate_book_word_count(UUID) IS 'Manually recalculates word count for a specific book - useful for fixing individual books';

-- 6. Ensure the trigger exists and is working correctly
-- Drop and recreate the trigger to ensure it uses the latest function
DROP TRIGGER IF EXISTS update_word_count_trigger ON file_system_items;

CREATE TRIGGER update_word_count_trigger
    AFTER INSERT OR UPDATE OR DELETE ON file_system_items
    FOR EACH ROW
    EXECUTE FUNCTION update_book_word_count();

-- 7. Create a table to track books that need word count recalculation
CREATE TABLE IF NOT EXISTS book_word_count_queue (
    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
    book_id UUID REFERENCES books(id) ON DELETE CASCADE NOT NULL,
    queued_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    processed_at TIMESTAMP WITH TIME ZONE,
    attempts INTEGER DEFAULT 0,
    last_error TEXT,
    
    -- Prevent duplicate entries for the same book
    UNIQUE(book_id)
);

-- Create index for performance
CREATE INDEX IF NOT EXISTS idx_book_word_count_queue_book_id ON book_word_count_queue(book_id);
CREATE INDEX IF NOT EXISTS idx_book_word_count_queue_processed_at ON book_word_count_queue(processed_at);
CREATE INDEX IF NOT EXISTS idx_book_word_count_queue_queued_at ON book_word_count_queue(queued_at);

-- Enable RLS
ALTER TABLE book_word_count_queue ENABLE ROW LEVEL SECURITY;

-- RLS policies - only allow system access for now
CREATE POLICY "System access only" ON book_word_count_queue
    FOR ALL USING (false);

-- 8. Function to queue a book for word count recalculation
CREATE OR REPLACE FUNCTION queue_book_for_word_count_recalc(book_uuid UUID)
RETURNS VOID
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
BEGIN
    -- Insert or update the queue entry
    INSERT INTO book_word_count_queue (book_id, queued_at, processed_at, attempts, last_error)
    VALUES (book_uuid, NOW(), NULL, 0, NULL)
    ON CONFLICT (book_id) DO UPDATE SET
        queued_at = NOW(),
        processed_at = NULL,
        attempts = 0,
        last_error = NULL;
END;
$$ LANGUAGE plpgsql;

-- 9. Function to process queued word count recalculations
CREATE OR REPLACE FUNCTION process_word_count_queue(batch_size INTEGER DEFAULT 10)
RETURNS TABLE(book_id UUID, old_count INTEGER, new_count INTEGER, success BOOLEAN, error_message TEXT)
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    queue_record RECORD;
    old_word_count INTEGER;
    new_word_count INTEGER;
    error_text TEXT;
BEGIN
    -- Process books that need recalculation
    FOR queue_record IN 
        SELECT q.book_id, b.title, b.word_count
        FROM book_word_count_queue q
        JOIN books b ON q.book_id = b.id
        WHERE q.processed_at IS NULL
        AND q.attempts < 3  -- Limit retry attempts
        ORDER BY q.queued_at ASC
        LIMIT batch_size
    LOOP
        BEGIN
            old_word_count := queue_record.word_count;
            
            -- Calculate new word count
            new_word_count := calculate_book_word_count(queue_record.book_id);
            
            -- Update the book's word count
            UPDATE books 
            SET word_count = new_word_count, 
                updated_at = NOW()
            WHERE id = queue_record.book_id;
            
            -- Mark as processed
            UPDATE book_word_count_queue
            SET processed_at = NOW(),
                last_error = NULL
            WHERE book_id = queue_record.book_id;
            
            -- Return success result
            book_id := queue_record.book_id;
            old_count := old_word_count;
            new_count := new_word_count;
            success := TRUE;
            error_message := NULL;
            
            RETURN NEXT;
            
        EXCEPTION WHEN OTHERS THEN
            -- Handle errors
            error_text := SQLERRM;
            
            -- Update attempt count and error
            UPDATE book_word_count_queue
            SET attempts = attempts + 1,
                last_error = error_text
            WHERE book_id = queue_record.book_id;
            
            -- Return error result
            book_id := queue_record.book_id;
            old_count := queue_record.word_count;
            new_count := NULL;
            success := FALSE;
            error_message := error_text;
            
            RETURN NEXT;
        END;
    END LOOP;
    
    RETURN;
END;
$$ LANGUAGE plpgsql;

-- 10. Enhanced trigger function that also queues books for background recalculation
CREATE OR REPLACE FUNCTION update_book_word_count_with_queue()
RETURNS TRIGGER 
SECURITY DEFINER
SET search_path = public, pg_catalog, pg_temp
AS $$
DECLARE
    book_uuid UUID;
    new_word_count INTEGER;
    should_recalc BOOLEAN := FALSE;
BEGIN
    -- Get the book_id for the file that was modified
    IF TG_OP = 'DELETE' THEN
        book_uuid := OLD.book_id;
    ELSE
        book_uuid := NEW.book_id;
    END IF;
    
    -- Check if this is a text file that should trigger recalculation
    IF (TG_OP = 'DELETE' AND OLD.type = 'file' AND (
        OLD.file_extension IN ('txt', 'md', 'markdown', 'doc', 'docx', 'rtf', 'odt', 'html', 'htm', 'xml', 'json', 'csv', 'tex', 'rst', 'asciidoc', 'adoc')
        OR OLD.mime_type LIKE 'text/%'
        OR (OLD.file_extension IS NULL AND OLD.content IS NOT NULL AND OLD.content != '')
    )) OR
       (TG_OP IN ('INSERT', 'UPDATE') AND NEW.type = 'file' AND (
        NEW.file_extension IN ('txt', 'md', 'markdown', 'doc', 'docx', 'rtf', 'odt', 'html', 'htm', 'xml', 'json', 'csv', 'tex', 'rst', 'asciidoc', 'adoc')
        OR NEW.mime_type LIKE 'text/%'
        OR (NEW.file_extension IS NULL AND NEW.content IS NOT NULL AND NEW.content != '')
    )) THEN
        should_recalc := TRUE;
    END IF;
    
    -- If we need to recalculate, do immediate calculation and also queue for background processing
    IF should_recalc THEN
        -- Try immediate calculation (for real-time updates)
        BEGIN
            new_word_count := calculate_book_word_count(book_uuid);
            
            UPDATE books 
            SET word_count = new_word_count, 
                updated_at = NOW()
            WHERE id = book_uuid;
            
        EXCEPTION WHEN OTHERS THEN
            -- If immediate calculation fails, just queue it for background processing
            NULL;
        END;
        
        -- Always queue for background processing as well (for reliability)
        PERFORM queue_book_for_word_count_recalc(book_uuid);
    END IF;
    
    -- Return appropriate record based on operation
    IF TG_OP = 'DELETE' THEN
        RETURN OLD;
    ELSE
        RETURN NEW;
    END IF;
END;
$$ LANGUAGE plpgsql;

-- 11. Replace the existing trigger with the enhanced version
DROP TRIGGER IF EXISTS update_word_count_trigger ON file_system_items;

CREATE TRIGGER update_word_count_trigger
    AFTER INSERT OR UPDATE OR DELETE ON file_system_items
    FOR EACH ROW
    EXECUTE FUNCTION update_book_word_count_with_queue();

-- 12. Queue all existing books for word count recalculation
INSERT INTO book_word_count_queue (book_id, queued_at, processed_at, attempts, last_error)
SELECT id, NOW(), NULL, 0, NULL
FROM books
ON CONFLICT (book_id) DO UPDATE SET
    queued_at = NOW(),
    processed_at = NULL,
    attempts = 0,
    last_error = NULL;

-- 13. Add comments for new functions
COMMENT ON FUNCTION queue_book_for_word_count_recalc(UUID) IS 'Queues a book for background word count recalculation';
COMMENT ON FUNCTION process_word_count_queue(INTEGER) IS 'Processes queued word count recalculations in batches';
COMMENT ON FUNCTION update_book_word_count_with_queue() IS 'Enhanced trigger that does immediate recalc and queues for background processing';
COMMENT ON TABLE book_word_count_queue IS 'Queue for books that need word count recalculation for reliability and batch processing';