# PAC-tree / model / test_pando.py
#!/usr/bin/env python3
"""
test_pando.py - Compare the performance of Pando and MTO layouts under TPC-H workload

This script compares the following metrics:
1. Data scan ratio (scanned data volume / total data volume)
2. Number of distribution trees created by layouts
3. Number of blocks instantiated in each table
4. Block tuple utilization rate (number of tuples / max block size)
"""

import os
import sys
import pickle
import numpy as np
import pandas as pd
# Put the parent of this file's directory at the front of sys.path so the
# `model.partition_algorithm` import below resolves when run as a script.
base_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(base_dir, ".."))

from model.partition_algorithm import PartitionAlgorithm


class PandoMTOComparator:
    """Build MTO and Pando layouts for benchmark tables and compare them.

    For each analyzed table the comparator records the scan ratio, the number
    of partition trees, the number of blocks, and the block utilization of
    both layouts, then prints and returns the results as a DataFrame.
    """

    def __init__(self, benchmark="tpch", scale_factor=1):
        """Store the benchmark settings.

        Args:
            benchmark: Benchmark name forwarded to ``PartitionAlgorithm``.
            scale_factor: Stored on the instance; no method of this class
                reads it.
        """
        self.benchmark = benchmark
        self.scale_factor = scale_factor
        self.queries = []
        self.tables = {}

    def build_mto_layout(self, table_name, block_size=10000):
        """Build the MTO layout using the standard PartitionAlgorithm process.

        Args:
            table_name: Table to load and partition.
            block_size: Block size passed to ``PartitionAlgorithm``.

        Returns:
            The ``PartitionAlgorithm`` instance after ``InitializeWithQDT()``.
        """
        print(f"Building MTO layout for {table_name}...")
        pa = PartitionAlgorithm(
            block_size=block_size, benchmark=self.benchmark, table_name=table_name
        )
        pa.load_data()
        # NOTE: "join_indeuced" is the keyword as spelled by the callee's API.
        pa.load_query(join_indeuced="MTO")

        # Replace the used-column list with positional indices 0..n-1.
        pa.used_columns = list(range(len(pa.used_columns)))
        pa.InitializeWithQDT()
        return pa

    def build_pando_layout(self, table_name, max_depth=10, block_size=10000):
        """Build the Pando layout for one table.

        Args:
            table_name: Table to load and partition.
            max_depth: Forwarded to ``InitializeWithPando``.
            block_size: Block size passed to ``PartitionAlgorithm``.

        Returns:
            The ``PartitionAlgorithm`` instance after ``InitializeWithPando()``.
        """
        print(f"Building Pando layout for {table_name}...")
        pa = PartitionAlgorithm(
            block_size=block_size, benchmark=self.benchmark, table_name=table_name
        )
        pa.load_data()
        pa.load_query(join_indeuced="MTO")
        pa.InitializeWithPando(max_depth=max_depth)

        return pa

    def calculate_scan_ratio(self, pa, queries, table_name):
        """Estimate the average fraction of the table scanned per query.

        For each query the scanned row count is estimated as the number of
        returned blocks times a rows-per-block figure, capped at the table
        size. A query whose lookup raises is counted as a full table scan.

        Args:
            pa: Layout object exposing ``tabledata`` and either
                ``partition_tree.query_single`` (with ``block_size``) or
                ``query_pando``.
            queries: Queries to evaluate.
            table_name: Unused; kept for interface compatibility.

        Returns:
            Scanned rows divided by (table rows * number of queries), or 0
            when there are no queries or the table is empty.
        """
        if not queries:
            return 0

        total_rows = len(pa.tabledata)
        if total_rows == 0:
            # An empty table has nothing to scan; this also avoids a
            # division by zero in the final ratio.
            return 0

        uses_partition_tree = hasattr(pa, "partition_tree")
        total_scanned = 0

        for query in queries:
            try:
                if uses_partition_tree:
                    blocks = pa.partition_tree.query_single(query)
                    scanned_rows = len(blocks) * pa.block_size
                else:
                    blocks = pa.query_pando(query)
                    # NOTE(review): fixed 1000 rows per block here, unlike the
                    # partition-tree branch which uses pa.block_size - confirm.
                    avg_block_size = 1000
                    scanned_rows = len(blocks) * avg_block_size
            except Exception:
                # Best effort: treat a failed lookup as a full scan, but let
                # KeyboardInterrupt/SystemExit propagate.
                scanned_rows = total_rows

            total_scanned += min(scanned_rows, total_rows)

        return total_scanned / (total_rows * len(queries))

    def count_blocks(self, pa):
        """Count the number of blocks in a layout.

        Returns the leaf count of ``pa.partition_tree`` when present and
        truthy; otherwise the product of the leaf counts of every tree in
        ``pa.pando_trees``; otherwise 1. Any ordinary exception raised while
        counting also yields 1.
        """
        try:
            if getattr(pa, "partition_tree", None):
                return len(pa.partition_tree.get_leaves())
            if getattr(pa, "pando_trees", None):
                block_count = 1
                for tree in pa.pando_trees:
                    block_count *= len(tree.get_leaves())
                return block_count
            return 1
        except Exception:
            # Best-effort fallback; do not swallow KeyboardInterrupt/SystemExit.
            return 1

    def calculate_block_utilization(self, pa):
        """Calculate block utilization as tuples held over tuple capacity.

        Capacity is (number of blocks) * ``pa.block_size`` * 2.
        NOTE(review): the factor 2 presumably reflects a maximum leaf size of
        twice the block size - confirm against PartitionAlgorithm.

        For a single partition tree the tuple count is the sum of the leaf
        ``node_size`` values. For Pando trees the block count is the product
        of the per-tree leaf counts and the tuple count is the root
        ``node_size`` of the last tree iterated.

        Returns:
            The utilization ratio, or 0 when neither layout is present or the
            capacity is not positive.
        """
        if getattr(pa, "partition_tree", None):
            leaves = pa.partition_tree.get_leaves()
            max_tuples = len(leaves) * pa.block_size * 2
            total_tuples = sum(leaf.node_size for leaf in leaves)
            return total_tuples / max_tuples if max_tuples > 0 else 0

        if getattr(pa, "pando_trees", None):
            block_count = 1
            last_tree = None
            for tree in pa.pando_trees:
                block_count *= len(tree.get_leaves())
                last_tree = tree
            max_tuples = block_count * pa.block_size * 2
            # Tracked explicitly instead of relying on the loop variable
            # leaking out of the for statement.
            total_tuples = last_tree.pt_root.node_size
            return total_tuples / max_tuples if max_tuples > 0 else 0

        return 0

    def run_comparison(self):
        """Run the comparison using the standard PartitionAlgorithm process.

        Only the first entry of the TPC-H table list ("lineitem") is
        analyzed. Any exception raised while processing a table is reported
        and re-raised.

        Returns:
            A pandas DataFrame with one row per analyzed table.
        """
        print("=== TPC-H Pando vs MTO Layout Comparison ===")
        tpch_tables = [
            "lineitem",
            "orders",
            "supplier",
            "customer",
            "nation",
            "region",
            "part",
            "partsupp",
        ]
        results = []
        for table_name in tpch_tables[:1]:
            print(f"\n--- Analyzing {table_name} ---")
            try:
                mto_pa = self.build_mto_layout(table_name)
                pando_pa = self.build_pando_layout(table_name)
                mto_scan_ratio = mto_pa.evaluate_single_table_access_cost()
                pando_scan_ratio = pando_pa.evaluate_pando_cost()
                # The MTO layout is counted as a single tree.
                mto_tree_count = 1
                pando_tree_count = len(pando_pa.pando_trees)
                mto_blocks = self.count_blocks(mto_pa)
                pando_blocks = self.count_blocks(pando_pa)
                mto_utilization = self.calculate_block_utilization(mto_pa)
                pando_utilization = self.calculate_block_utilization(pando_pa)
                results.append(
                    {
                        "table": table_name,
                        "mto_scan_ratio": mto_scan_ratio,
                        "pando_scan_ratio": pando_scan_ratio,
                        "mto_trees": mto_tree_count,
                        "pando_trees": pando_tree_count,
                        "mto_blocks": mto_blocks,
                        "pando_blocks": pando_blocks,
                        "mto_utilization": mto_utilization,
                        "pando_utilization": pando_utilization,
                    }
                )

                print(
                    f"  Scan Ratio - MTO: {mto_scan_ratio:.4f}, Pando: {pando_scan_ratio:.4f}"
                )
                print(
                    f"  Tree Count - MTO: {mto_tree_count}, Pando: {pando_tree_count}"
                )
                print(f"  Block Count - MTO: {mto_blocks}, Pando: {pando_blocks}")
                print(
                    f"  Utilization - MTO: {mto_utilization:.4f}, Pando: {pando_utilization:.4f}"
                )

            except Exception as e:
                print(f"  Error processing {table_name}: {e}")
                # Bare raise re-raises the active exception unchanged.
                raise

        if not results:
            print("No results generated.")

        df = pd.DataFrame(results)
        self.print_summary(df)

        return df

    def print_summary(self, df):
        """Print the row count and per-metric column means of ``df``.

        Args:
            df: DataFrame as produced by ``run_comparison``; the means are
                printed only when it has at least one row.
        """
        print("\n=== Summary Statistics ===")
        print(f"Total tables analyzed: {len(df)}")
        if len(df) > 0:
            print(f"Average scan ratio - MTO: {df['mto_scan_ratio'].mean():.4f}")
            print(f"Average scan ratio - Pando: {df['pando_scan_ratio'].mean():.4f}")
            print(f"Average tree count - MTO: {df['mto_trees'].mean():.2f}")
            print(f"Average tree count - Pando: {df['pando_trees'].mean():.2f}")
            print(f"Average block count - MTO: {df['mto_blocks'].mean():.2f}")
            print(f"Average block count - Pando: {df['pando_blocks'].mean():.2f}")
            print(f"Average utilization - MTO: {df['mto_utilization'].mean():.4f}")
            print(f"Average utilization - Pando: {df['pando_utilization'].mean():.4f}")


def main():
    """Entry point: run the Pando vs MTO comparison for TPC-H at scale factor 1."""
    PandoMTOComparator(benchmark="tpch", scale_factor=1).run_comparison()

    # # try:
    # #     import matplotlib.pyplot as plt

    # #     fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # #     # Scan ratio comparison
    # #     axes[0, 0].bar(
    # #         results["table"], results["mto_scan_ratio"], alpha=0.7, label="MTO"
    # #     )
    # #     axes[0, 0].bar(
    # #         results["table"], results["pando_scan_ratio"], alpha=0.7, label="Pando"
    # #     )
    # #     axes[0, 0].set_title("Scan Ratio Comparison")
    # #     axes[0, 0].set_ylabel("Scan Ratio")
    # #     axes[0, 0].legend()
    # #     axes[0, 0].tick_params(axis="x", rotation=45)

    # #     # Block count comparison
    # #     axes[0, 1].bar(results["table"], results["mto_blocks"], alpha=0.7, label="MTO")
    # #     axes[0, 1].bar(
    # #         results["table"], results["pando_blocks"], alpha=0.7, label="Pando"
    # #     )
    # #     axes[0, 1].set_title("Block Count Comparison")
    # #     axes[0, 1].set_ylabel("Block Count")
    # #     axes[0, 1].legend()
    # #     axes[0, 1].tick_params(axis="x", rotation=45)

    # #     # Utilization comparison
    # #     axes[1, 0].bar(
    # #         results["table"], results["mto_utilization"], alpha=0.7, label="MTO"
    # #     )
    # #     axes[1, 0].bar(
    # #         results["table"], results["pando_utilization"], alpha=0.7, label="Pando"
    # #     )
    # #     axes[1, 0].set_title("Block Utilization Comparison")
    # #     axes[1, 0].set_ylabel("Utilization")
    # #     axes[1, 0].legend()
    # #     axes[1, 0].tick_params(axis="x", rotation=45)

    # #     # Tree count comparison
    # #     axes[1, 1].bar(results["table"], results["mto_trees"], alpha=0.7, label="MTO")
    # #     axes[1, 1].bar(
    # #         results["table"], results["pando_trees"], alpha=0.7, label="Pando"
    # #     )
    # #     axes[1, 1].set_title("Tree Count Comparison")
    # #     axes[1, 1].set_ylabel("Tree Count")
    # #     axes[1, 1].legend()
    # #     axes[1, 1].tick_params(axis="x", rotation=45)

    # #     plt.tight_layout()
    # #     plt.savefig(os.path.join(base_dir, "pando_mto_comparison.png"))
    # #     plt.close()
    # #     print("Visualization saved to pando_mto_comparison.png")

    # except ImportError:
    #     print("matplotlib not available, skipping visualization")


# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()