import os
import torch
import torch.nn as nn
from torchvision.transforms import Normalize, Compose, InterpolationMode, ToTensor
import torchvision.transforms.functional as F
import numpy as np
from PIL import Image
from hpsv2.src.open_clip import create_model, get_tokenizer
from flow_grpo.reward_ckpt_path import CKPT_PATH
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
class ResizeMaxSize(nn.Module):
def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0):
super().__init__()
if not isinstance(max_size, int):
raise TypeError(f"Size should be int. Got {type(max_size)}")
self.max_size = max_size
self.interpolation = interpolation
self.fn = min if fn == "min" else min # Note: both 'min' and 'max' map to min
self.fill = fill
def forward(self, img):
if isinstance(img, torch.Tensor):
# Assuming NCHW, get H and W from the last two dimensions
height, width = img.shape[-2:]
else:
width, height = img.size
scale = self.max_size / float(max(height, width))
if scale != 1.0:
new_size = tuple(round(dim * scale) for dim in (height, width))
img = F.resize(img, new_size, self.interpolation)
pad_h = self.max_size - new_size[0]
pad_w = self.max_size - new_size[1]
img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill)
return img
class MaskAwareNormalize(nn.Module):
def __init__(self, mean, std):
super().__init__()
self.normalize = Normalize(mean=mean, std=std)
def forward(self, tensor):
# Assuming NCHW, check the channel dimension
if tensor.shape[1] == 4:
# Process each image in the batch
normalized_parts = []
for i in range(tensor.shape[0]):
img_slice = tensor[i]
normalized_rgb = self.normalize(img_slice[:3])
alpha_channel = img_slice[3:]
normalized_parts.append(torch.cat([normalized_rgb, alpha_channel], dim=0))
return torch.stack(normalized_parts, dim=0)
else:
return self.normalize(tensor)
def image_transform_tensor(
image_size: int,
mean: tuple = None,
std: tuple = None,
fill_color: int = 0,
):
mean = mean or OPENAI_DATASET_MEAN
std = std or OPENAI_DATASET_STD
if not isinstance(mean, (list, tuple)):
mean = (mean,) * 3
if not isinstance(std, (list, tuple)):
std = (std,) * 3
normalize = MaskAwareNormalize(mean=mean, std=std)
transforms = [
ResizeMaxSize(image_size, fill=fill_color),
normalize,
]
return Compose(transforms)
class HPSv2Scorer(nn.Module):
def __init__(self, dtype, device):
super().__init__()
self.dtype = dtype
self.device = device
model = create_model(
"ViT-H-14",
os.path.join(CKPT_PATH, "open_clip_pytorch_model.bin"),
precision="amp",
device=device,
jit=False,
force_quick_gelu=False,
force_custom_text=False,
force_patch_dropout=False,
force_image_size=None,
pretrained_image=False,
output_dict=True,
)
image_mean = getattr(model.visual, "image_mean", None)
image_std = getattr(model.visual, "image_std", None)
image_size = model.visual.image_size
if isinstance(image_size, tuple):
image_size = image_size[0]
preprocess_val = image_transform_tensor(
image_size,
mean=image_mean,
std=image_std,
)
self.model = model.to(device)
self.preprocess_val = preprocess_val
checkpoint = torch.load(os.path.join(CKPT_PATH, "HPS_v2.1_compressed.pt"), map_location="cpu")
self.model.load_state_dict(checkpoint["state_dict"])
self.processor = get_tokenizer("ViT-H-14")
self.eval()
@torch.no_grad()
def __call__(self, images, prompts):
image = self.preprocess_val(images.to(self.dtype).to(device=self.device, non_blocking=True))
# Process the prompt
text = self.processor(prompts).to(device=self.device, non_blocking=True)
outputs = self.model(image, text)
image_features, text_features = outputs["image_features"], outputs["text_features"]
logits_per_image = image_features @ text_features.T
hps_score = torch.diagonal(logits_per_image, 0)
return hps_score.contiguous()
def main():
scorer = HPSv2Scorer(dtype=torch.float32, device="cuda")
images = [
"test_cases/nasa.jpg",
"test_cases/hello world.jpg",
]
pil_images = [Image.open(img) for img in images]
prompts = [
'An astronaut’s glove floating in zero-g with "NASA 2049" on the wrist',
'New York Skyline with "Hello World" written with fireworks on the sky',
]
images = [np.array(img) for img in pil_images]
images = np.array(images)
images = images.transpose(0, 3, 1, 2) # NHWC -> NCHW
images = torch.tensor(images, dtype=torch.uint8) / 255.0
print(scorer(images, prompts))
if __name__ == "__main__":
main()