# Pref-Restoration / DiffusionNFT / dataset / convert.py
import json
import os

def convert_json_to_jsonl(
    input_path: str = '/data/zgq/yaozhengjian/Datasets/FFHQ_val/CelebA_HQ/captions_lq.json',
    output_dir: str = '/data/phd/yaozhengjian/Code/RL/ART-FRv2/DiffusionNFT/dataset/restore_face',
    output_file: str = 'train_metadata.jsonl',
) -> None:
    """Convert a JSON caption file into a JSONL metadata file.

    The source file is expected to hold an iterable of objects that each
    provide an ``image`` and a ``caption`` key. Every object becomes one
    output line of the form
    ``{"prompt": <caption>, "image": <image>, "requirement": "Restore"}``.
    The ``image`` value is copied verbatim; wrap it in ``os.path.basename``
    if only the file name should be stored.

    Args:
        input_path: Path of the source JSON file.
        output_dir: Directory that receives the JSONL file; created
            (including parents) when it does not exist yet.
        output_file: Name of the JSONL file inside ``output_dir``.

    Returns:
        None. A missing or malformed source file is reported on stdout and
        the function returns without writing anything.

    Raises:
        KeyError: If an item lacks the ``image`` or ``caption`` key. This is
            raised before the output file is opened, so an existing output
            file is never replaced by a partial one.
    """
    output_path = os.path.join(output_dir, output_file)

    # Create the output directory. exist_ok avoids the check-then-create
    # race; the isdir probe only decides whether to report the creation.
    if output_dir:
        dir_missing = not os.path.isdir(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        if dir_missing:
            print(f"Created directory: {output_dir}")

    # Read the source JSON file.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found {input_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: File {input_path} is not valid JSON format")
        return
    print(f"Successfully read source file, containing {len(data)} items.")

    # Serialize every record up front: a malformed item then fails before
    # the output file is truncated instead of leaving a half-written file.
    lines = [
        json.dumps(
            {
                "prompt": item['caption'],
                "image": item['image'],
                "requirement": "Restore",
            },
            ensure_ascii=False,
        ) + '\n'
        for item in data
    ]

    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.writelines(lines)

    print(f"Conversion complete! File saved to: {output_path}")

    # Read the file back and show its first 3 lines for a quick inspection.
    print("\n--- Example of the first 3 lines of the generated file ---")
    with open(output_path, 'r', encoding='utf-8') as f_check:
        # range comes first in zip so no extra line is consumed at the end.
        for _, line in zip(range(3), f_check):
            print(line.strip())

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    convert_json_to_jsonl()