# prg-lang-2 / final / combine / combine.py
import re
import argparse
from pathlib import Path
from collections import defaultdict, deque
from typing import List, Set, Tuple, Dict

def parse_dependencies(files: List[Path], root_path: Path) -> Tuple[Dict[Path, Set[Path]], Dict[Path, Set[Path]]]:
    """Parse quoted ``#include "x.h"`` dependencies between files.

    Each include is looked up first in the directory of the including file
    (the order C compilers use for quoted includes) and then under
    ``root_path``. Includes that match no existing file are ignored.

    Args:
        files: Files to scan for ``#include "....h"`` lines.
        root_path: Fallback directory used to resolve include paths.

    Returns:
        A ``(dependencies, reverse_dependencies)`` pair of defaultdicts.
        ``dependencies[f]`` is the set of headers included by ``f``;
        ``reverse_dependencies[h]`` is the set of files that include ``h``.
    """
    dependencies = defaultdict(set)
    reverse_dependencies = defaultdict(set)
    include_pattern = re.compile(r'#include "(.*\.h)"')
    # Map canonical locations back to the exact Path objects in ``files`` so
    # an include spelled "sub/../a.h" is recorded under the same key as "a.h".
    known_files = {path.resolve(): path for path in files}

    for file in files:
        search_dirs = [file.parent]
        if root_path != file.parent:
            search_dirs.append(root_path)

        with file.open() as infile:
            for line in infile:
                match = include_pattern.search(line)
                if not match:
                    continue
                for directory in search_dirs:
                    candidate = directory / match.group(1)
                    if candidate.is_file():
                        included_file = known_files.get(candidate.resolve(), candidate)
                        dependencies[file].add(included_file)
                        reverse_dependencies[included_file].add(file)
                        break  # first hit wins, like a compiler's search order
    return dependencies, reverse_dependencies

def topological_sort(files: List[Path], dependencies: Dict[Path, Set[Path]]) -> List[Path]:
    """Order files so that each file comes before the files listed for it.

    ``dependencies`` is read as an adjacency map: every entry in
    ``dependencies[f]`` must appear after ``f`` in the result (Kahn's
    algorithm). Files with no incoming edge keep their relative order from
    ``files``.

    Files that take part in a cycle can never reach in-degree zero; rather
    than dropping them silently they are appended at the end in their
    original order, so every input file is always present in the result.

    Args:
        files: Files to order.
        dependencies: Mapping from a file to the files that must follow it.
            Missing keys are treated as "no successors"; the mapping is not
            modified.

    Returns:
        The ordered list of files.
    """
    indegree = defaultdict(int)
    for file in files:
        # .get() avoids KeyError on plain dicts and key insertion on defaultdicts.
        for dep in dependencies.get(file, ()):
            indegree[dep] += 1

    queue = deque(file for file in files if indegree[file] == 0)
    sorted_files = []
    while queue:
        file = queue.popleft()
        sorted_files.append(file)
        for dep in dependencies.get(file, ()):
            indegree[dep] -= 1
            if indegree[dep] == 0:
                queue.append(dep)

    # Anything still missing is part of a dependency cycle: keep it anyway.
    emitted = set(sorted_files)
    sorted_files.extend(file for file in files if file not in emitted)
    return sorted_files

def _directive_name(line: str) -> str:
    """Return the preprocessor keyword on *line* (e.g. "endif"), or "" if none."""
    match = re.match(r"\s*#\s*([a-z_]+)", line)
    return match.group(1) if match else ""


def _guard_define_index(lines: List[str], index: int) -> int:
    """Return the index of the ``#define`` completing a guard opened at *index*.

    A line opens an include guard when it matches ``#ifndef ..._H`` and the
    next non-blank line is a ``#define`` of the same macro. Returns -1 when
    ``lines[index]`` does not open such a guard.
    """
    if not re.match(r"#ifndef .*_H", lines[index]):
        return -1
    macro = lines[index].split()[1]
    for j in range(index + 1, len(lines)):
        if not lines[j].strip():
            continue
        defined = re.match(r"\s*#\s*define\s+(\w+)", lines[j])
        if defined and defined.group(1) == macro:
            return j
        return -1
    return -1


def process_file(fname: Path) -> Tuple[List[str], List[str]]:
    """Split one C file into hoistable include lines and the remaining code.

    Rules applied line by line:

    * An include guard (``#ifndef NAME_H`` directly followed by ``#define``
      of the same macro) is removed together with its *matching* ``#endif``.
      Conditional nesting is tracked so inner ``#endif`` lines are kept, and
      ``#endif`` followed by a comment is recognised.
    * While the file is still in its leading "include section" (only blank
      lines and lines starting with ``#`` seen so far): ``#include`` lines
      are collected unless they are quoted ``.h`` includes, which are
      dropped; a whole ``#if``/``#ifdef``/``#ifndef`` ... ``#endif`` block
      goes to the include lines if it contains an ``#include`` line and to
      the code otherwise.
    * Every other line is code.

    Args:
        fname: File to read.

    Returns:
        ``(include_lines, processed_lines)``.
    """
    with fname.open() as infile:
        lines = [line.rstrip('\n') for line in infile]

    opening_directives = ("if", "ifdef", "ifndef")
    include_lines: List[str] = []
    processed_lines: List[str] = []
    # One entry per conditional currently open in the main loop:
    # True for a stripped include guard, False for an ordinary conditional.
    preprocessor_stack: List[bool] = []
    in_include_section = True

    i = 0
    while i < len(lines):
        line = lines[i]
        directive = _directive_name(line)

        guard_define = _guard_define_index(lines, i)
        if guard_define >= 0:
            # Drop the guard's #ifndef and #define; remember to drop its #endif.
            preprocessor_stack.append(True)
            i = guard_define + 1
            continue

        if directive == "endif" and preprocessor_stack:
            closes_guard = preprocessor_stack.pop()
            if closes_guard:
                i += 1
                continue
            # An ordinary conditional ends here: the line is emitted below.

        if in_include_section and line and not line.startswith("#"):
            in_include_section = False

        if directive in opening_directives:
            if in_include_section:
                # Consume the whole (possibly nested) conditional block at once.
                block_start = i
                depth = 1
                found_include = False
                while i + 1 < len(lines) and depth > 0:
                    i += 1
                    inner = _directive_name(lines[i])
                    if inner in opening_directives:
                        depth += 1
                    elif inner == "endif":
                        depth -= 1
                    elif lines[i].startswith("#include"):
                        found_include = True

                block = lines[block_start:i + 1]
                if found_include:
                    include_lines.extend(block)
                else:
                    processed_lines.extend(block)
                i += 1
                continue
            preprocessor_stack.append(False)
        elif in_include_section and line.startswith("#include"):
            # Quoted project headers are inlined elsewhere, so only keep the rest.
            if not re.match(r'#include ".*\.h"', line):
                include_lines.append(line)
            i += 1
            continue

        processed_lines.append(line)
        i += 1

    return include_lines, processed_lines

def _split_include_units(include_lines: List[str]) -> List[Tuple[str, ...]]:
    """Group include lines into units: single lines or whole ``#if``...``#endif`` blocks."""
    units = []
    current = []
    depth = 0
    for line in include_lines:
        stripped = line.lstrip()
        current.append(line)
        if stripped.startswith("#if"):
            depth += 1
        elif stripped.startswith("#endif") and depth > 0:
            depth -= 1
        if depth == 0:
            units.append(tuple(current))
            current = []
    if current:
        # Unbalanced input: keep the trailing lines together as one unit.
        units.append(tuple(current))
    return units


def combine_files(output_file: Path, h_files: List[Path], c_files: List[Path], root_path: Path, start_file: str = None) -> None:
    """
    Combine multiple C source files into a single file.

    Headers are written first, ordered by their include dependencies, then
    the C files, then the start file (if any). Include lines collected from
    all files are written once at the top of the output. They are
    de-duplicated per unit (a single include line, or a complete conditional
    block) so that conditional blocks keep their ``#endif``.

    The ``c_files`` list passed by the caller is not modified.

    Args:
        output_file: Path to the output file
        h_files: List of header files
        c_files: List of C source files
        root_path: Root path for resolving includes
        start_file: Name of the file to process last (e.g., "main.c")
    """
    _, reverse_dependencies = parse_dependencies(h_files, root_path)
    sorted_h_files = topological_sort(h_files, reverse_dependencies)

    # Pick the first C file named like start_file without touching the
    # caller's list.
    main_index = None
    if start_file:
        main_index = next(
            (index for index, file in enumerate(c_files) if file.name == start_file),
            None,
        )
    if main_index is None:
        ordered_files = sorted_h_files + list(c_files)
    else:
        ordered_files = (
            sorted_h_files
            + c_files[:main_index]
            + c_files[main_index + 1:]
            + [c_files[main_index]]
        )

    all_includes = []
    seen_units = set()
    all_code = []

    with output_file.open("w", encoding="utf-8") as outfile:
        for fname in ordered_files:
            include_lines, processed_lines = process_file(fname)
            for unit in _split_include_units(include_lines):
                if unit not in seen_units:
                    seen_units.add(unit)
                    all_includes.extend(unit)
            all_code.append(f"//{'-' * 20} {fname.name} {'-' * 20}//\n")
            all_code.extend("\n".join(processed_lines).strip().split("\n"))
            all_code.append("")

        outfile.write("\n".join(all_includes))
        outfile.write("\n\n")
        outfile.write("\n".join(all_code))

def find_source_files(target_dir: Path) -> Tuple[List[Path], List[Path]]:
    """
    Find .c and .h files recursively in the target directory,
    excluding directories containing a file named like this script.

    Only regular files are returned (a directory that happens to be named
    ``foo.c`` is skipped), the suffix check is case-insensitive, and both
    lists are sorted so the result does not depend on filesystem order.

    Args:
        target_dir: Directory to search.

    Returns:
        ``(h_files, c_files)``.
    """
    script_name = Path(__file__).name
    excluded_dirs = {path.parent for path in target_dir.rglob(script_name)}

    h_files: List[Path] = []
    c_files: List[Path] = []
    for path in sorted(target_dir.rglob("*.[ch]")):
        if not path.is_file():
            continue
        # Skip anything located in (or below) an excluded directory.
        if not excluded_dirs.isdisjoint(path.parents):
            continue

        if path.suffix.lower() == '.h':
            h_files.append(path)
        else:
            c_files.append(path)

    return h_files, c_files

def main():
    """Parse the command line, collect the sources and write the combined file."""
    parser = argparse.ArgumentParser(description="Combine C source files into a single file")
    parser.add_argument(
        '--target-dir', '-d',
        type=Path,
        help='Target directory containing source files',
        required=True,
    )
    parser.add_argument(
        '--output-file', '-o',
        type=Path,
        help='Output file path',
        # By default the result is written next to this script.
        default=Path(__file__).parent / "main.c",
    )
    parser.add_argument(
        '--start-file', '-s',
        type=str,
        help='File to process last (e.g., main.c)',
    )
    args = parser.parse_args()

    headers, sources = find_source_files(args.target_dir)
    if not (headers or sources):
        print(f"No source files found in {args.target_dir}")
        return

    # Without an explicit start file, fall back to main.c when the target
    # directory has one directly inside it.
    last_file = args.start_file
    if last_file is None and (args.target_dir / "main.c").exists():
        last_file = "main.c"
        print(f"main.c was specified as start_file in {args.target_dir}")

    combine_files(args.output_file, headers, sources, args.target_dir, last_file)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()