# PAC-tree / model / test_bounding_split.py
from partition_algorithm import PartitionAlgorithm
import logging
from colorlog import ColoredFormatter

# Console logging for this script: the root logger is reset to a single
# colourised stream handler that lets everything from DEBUG upwards through.
formatter = ColoredFormatter(
    "%(log_color)s%(levelname)-8s:%(name)s:%(message)s",
    log_colors=dict(
        DEBUG="cyan",
        INFO="green",
        WARNING="yellow",
        ERROR="red",
        CRITICAL="bold_red",
    ),
)

handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)

# Drop any handlers already attached to the root logger before installing ours.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers = []
logger.addHandler(handler)


def test_bounding_split_effect(benchmark="tpch"):
    """
    Test the effect of bounding split when enabled and disabled.

    For every table listed for the chosen benchmark, a partition tree is
    built twice via ``InitializeWithJT`` -- once with
    ``enable_bounding_split=True`` and once with ``False`` (median extension
    is disabled in both runs). The single-table access cost and the tree
    depth of each run are logged, followed by the average cost per setting.

    Args:
        benchmark: Key of the benchmark to evaluate ("tpch", "imdb" or
            "tpcds"). Defaults to "tpch", the value that used to be
            hard-coded. Because the argument has a default, pytest does not
            treat it as a fixture request.

    Raises:
        ValueError: If ``benchmark`` is not one of the known benchmarks.
    """
    benchmark_dict = {
        "tpch": [
            "lineitem",
            "orders",
            "customer",
            # Remaining TPC-H tables are currently left out of the comparison:
            # "nation",
            # "region",
            # "part",
            # "supplier",
            # "partsupp",
        ],
        "imdb": ["title", "movie_companies", "cast_info", "name"],
        "tpcds": [
            "store",
            "item",
            "household_demographics",
            "customer",
        ],
    }
    if benchmark not in benchmark_dict:
        raise ValueError(
            f"unknown benchmark {benchmark!r}; expected one of {sorted(benchmark_dict)}"
        )

    pa = PartitionAlgorithm(benchmark=benchmark)
    # NOTE: "join_indeuced" is the keyword spelling these loader calls are
    # invoked with in this file; it is kept as is so the calls stay compatible.
    pa.load_join_query(join_indeuced="PAW")

    # cost_dict[bounding split enabled?][table name] -> single-table access cost
    cost_dict = {}

    for tablename in benchmark_dict[benchmark]:
        pa.table_name = tablename
        pa.load_data()
        pa.load_query(join_indeuced="PAW")

        for if_bounding_split in (True, False):
            pa.InitializeWithJT(
                enable_bounding_split=if_bounding_split, enable_median_extend=False
            )
            tree = pa.partition_tree
            tree.name = "PAC-Tree"
            tree_depth = pa.evaluate_tree_depth(tree.pt_root, 0)
            tot_cost = pa.evaluate_single_table_access_cost()
            cost_dict.setdefault(if_bounding_split, {})[tablename] = tot_cost
            logger.info(
                f"enable bounding: {if_bounding_split}, {tot_cost}, max_depth:{tree_depth}"
            )

    # Every inner dict holds at least one table, so the division is safe.
    for if_bounding_split, table_costs in cost_dict.items():
        avg_cost = sum(table_costs.values()) / len(table_costs)
        logger.info(
            f"Average cost for bounding split {'enabled' if if_bounding_split else 'disabled'}: {avg_cost}"
        )
# Run the comparison when this file is executed directly as a script.
if __name__ == "__main__":
    test_bounding_split_effect()

"""
Bounding split test results (cost with bounding split enabled vs. disabled)

imdb
table                     enabled    disabled
title                     0.022      0.02495
movie_companies           0.04585    0.069358
cast_info                 0.0083     0.0083
name                      0.0414     0.041


tpch
table                     enabled    disabled
lineitem                  0.313      0.313
orders                    0.0527     0.0687
customer                  0.070      0.07009


tpcds (same columns as above)
table                     enabled    disabled
store                     0.17105    0.17105
item                      0.1307     0.1307
household_demographics    0.4026     0.4026
customer                  0.314      0.3148

"""