#!/usr/bin/env python3 """ test_pando.py - Compare the performance of Pando and MTO layouts under TPC-H workload This script compares the following metrics: 1. Data scan ratio (scanned data volume / total data volume) 2. Number of distribution trees created by layouts 3. Number of blocks instantiated in each table 4. Block tuple utilization rate (number of tuples / max block size) """ import os import sys import pickle import numpy as np import pandas as pd base_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(base_dir, "..")) from model.partition_algorithm import PartitionAlgorithm class PandoMTOComparator: def __init__(self, benchmark="tpch", scale_factor=1): self.benchmark = benchmark self.scale_factor = scale_factor self.queries = [] self.tables = {} def build_mto_layout(self, table_name, block_size=10000): """Build MTO layout - using standard process of PartitionAlgorithm""" print(f"Building MTO layout for {table_name}...") pa = PartitionAlgorithm( block_size=block_size, benchmark=self.benchmark, table_name=table_name ) pa.load_data() pa.load_query(join_indeuced="MTO") pa.used_columns = list(range(len(pa.used_columns))) pa.InitializeWithQDT() return pa def build_pando_layout(self, table_name, max_depth=10, block_size=10000): print(f"Building Pando layout for {table_name}...") pa = PartitionAlgorithm( block_size=block_size, benchmark=self.benchmark, table_name=table_name ) pa.load_data() pa.load_query(join_indeuced="MTO") pa.InitializeWithPando(max_depth=max_depth) return pa def calculate_scan_ratio(self, pa, queries, table_name): """计算scan ratio""" if not queries: return 0 total_rows = len(pa.tabledata) total_scanned = 0 for query in queries: if hasattr(pa, "partition_tree"): try: blocks = pa.partition_tree.query_single(query) scanned_rows = len(blocks) * pa.block_size except: scanned_rows = total_rows else: try: blocks = pa.query_pando(query) avg_block_size = 1000 # 默认块大小 scanned_rows = len(blocks) * avg_block_size except: scanned_rows = total_rows total_scanned += min(scanned_rows, total_rows) return total_scanned / (total_rows * len(queries)) if queries else 0 def count_blocks(self, pa): """Count the number of blocks""" try: if hasattr(pa, "partition_tree") and pa.partition_tree: return len(pa.partition_tree.get_leaves()) elif hasattr(pa, "pando_trees") and pa.pando_trees: total_leafs = 1 for tree in pa.pando_trees: total_leafs *= len(tree.get_leaves()) return total_leafs else: return 1 except: return 1 def calculate_block_utilization(self, pa): """Calculate block utilization""" if hasattr(pa, "partition_tree") and pa.partition_tree: total_leafs = pa.partition_tree.get_leaves() max_tuples = ( len(total_leafs) * pa.block_size * 2 ) total_tuples = sum([leaf.node_size for leaf in total_leafs]) return total_tuples / max_tuples if max_tuples > 0 else 0 elif hasattr(pa, "pando_trees") and pa.pando_trees: total_tuples, max_tuples = 0, 0 total_leafs = 1 for tree in pa.pando_trees: total_leafs *= len(tree.get_leaves()) max_tuples = total_leafs * pa.block_size * 2 total_tuples = tree.pt_root.node_size return total_tuples / max_tuples if max_tuples > 0 else 0 else: return 0 def run_comparison(self): """Run complete comparison experiment - reuse standard process of PartitionAlgorithm""" print("=== TPC-H Pando vs MTO Layout Comparison ===") tpch_tables = [ "lineitem", "orders", "supplier", "customer", "nation", "region", "part", "partsupp", ] results = [] for table_name in tpch_tables[:1]: print(f"\n--- Analyzing {table_name} ---") try: mto_pa = self.build_mto_layout(table_name) pando_pa = self.build_pando_layout(table_name) mto_scan_ratio = mto_pa.evaluate_single_table_access_cost() pando_scan_ratio = pando_pa.evaluate_pando_cost() mto_tree_count = 1 pando_tree_count = len(pando_pa.pando_trees) mto_blocks = self.count_blocks(mto_pa) pando_blocks = self.count_blocks(pando_pa) mto_utilization = self.calculate_block_utilization(mto_pa) pando_utilization = self.calculate_block_utilization(pando_pa) results.append( { "table": table_name, "mto_scan_ratio": mto_scan_ratio, "pando_scan_ratio": pando_scan_ratio, "mto_trees": mto_tree_count, "pando_trees": pando_tree_count, "mto_blocks": mto_blocks, "pando_blocks": pando_blocks, "mto_utilization": mto_utilization, "pando_utilization": pando_utilization, } ) print( f" Scan Ratio - MTO: {mto_scan_ratio:.4f}, Pando: {pando_scan_ratio:.4f}" ) print( f" Tree Count - MTO: {mto_tree_count}, Pando: {pando_tree_count}" ) print(f" Block Count - MTO: {mto_blocks}, Pando: {pando_blocks}") print( f" Utilization - MTO: {mto_utilization:.4f}, Pando: {pando_utilization:.4f}" ) except Exception as e: print(f" Error processing {table_name}: {e}") raise e if not results: print("No results generated.") df = pd.DataFrame(results) self.print_summary(df) return df def print_summary(self, df): print("\n=== Summary Statistics ===") print(f"Total tables analyzed: {len(df)}") if len(df) > 0: print(f"Average scan ratio - MTO: {df['mto_scan_ratio'].mean():.4f}") print(f"Average scan ratio - Pando: {df['pando_scan_ratio'].mean():.4f}") print(f"Average tree count - MTO: {df['mto_trees'].mean():.2f}") print(f"Average tree count - Pando: {df['pando_trees'].mean():.2f}") print(f"Average block count - MTO: {df['mto_blocks'].mean():.2f}") print(f"Average block count - Pando: {df['pando_blocks'].mean():.2f}") print(f"Average utilization - MTO: {df['mto_utilization'].mean():.4f}") print(f"Average utilization - Pando: {df['pando_utilization'].mean():.4f}") def main(): comparator = PandoMTOComparator(benchmark="tpch", scale_factor=1) comparator.run_comparison() # # try: # # import matplotlib.pyplot as plt # # fig, axes = plt.subplots(2, 2, figsize=(12, 10)) # # # Scan Ratio对比 # # axes[0, 0].bar( # # results["table"], results["mto_scan_ratio"], alpha=0.7, label="MTO" # # ) # # axes[0, 0].bar( # # results["table"], results["pando_scan_ratio"], alpha=0.7, label="Pando" # # ) # # axes[0, 0].set_title("Scan Ratio Comparison") # # axes[0, 0].set_ylabel("Scan Ratio") # # axes[0, 0].legend() # # axes[0, 0].tick_params(axis="x", rotation=45) # # # Block Count对比 # # axes[0, 1].bar(results["table"], results["mto_blocks"], alpha=0.7, label="MTO") # # axes[0, 1].bar( # # results["table"], results["pando_blocks"], alpha=0.7, label="Pando" # # ) # # axes[0, 1].set_title("Block Count Comparison") # # axes[0, 1].set_ylabel("Block Count") # # axes[0, 1].legend() # # axes[0, 1].tick_params(axis="x", rotation=45) # # # Utilization对比 # # axes[1, 0].bar( # # results["table"], results["mto_utilization"], alpha=0.7, label="MTO" # # ) # # axes[1, 0].bar( # # results["table"], results["pando_utilization"], alpha=0.7, label="Pando" # # ) # # axes[1, 0].set_title("Block Utilization Comparison") # # axes[1, 0].set_ylabel("Utilization") # # axes[1, 0].legend() # # axes[1, 0].tick_params(axis="x", rotation=45) # # # Tree Count对比 # # axes[1, 1].bar(results["table"], results["mto_trees"], alpha=0.7, label="MTO") # # axes[1, 1].bar( # # results["table"], results["pando_trees"], alpha=0.7, label="Pando" # # ) # # axes[1, 1].set_title("Tree Count Comparison") # # axes[1, 1].set_ylabel("Tree Count") # # axes[1, 1].legend() # # axes[1, 1].tick_params(axis="x", rotation=45) # # plt.tight_layout() # # plt.savefig(os.path.join(base_dir, "pando_mto_comparison.png")) # # plt.close() # # print("Visualization saved to pando_mto_comparison.png") # except ImportError: # print("matplotlib not available, skipping visualization") if __name__ == "__main__": main()