From d0c714ae4afa1c011269a956d6f260f84f77025e Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 19 Aug 2022 16:01:56 +0000
Subject: [PATCH 1/6] [Safety Checker] Add Safety Checker Module

---
 scripts/txt2img.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index da77e1a..0af430c 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -16,12 +16,29 @@ from ldm.util import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.plms import PLMSSampler
 
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from transformers import AutoFeatureExtractor
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v-1-3", use_auth_token=True)
+safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-v-1-3", use_auth_token=True)
 
 def chunk(it, size):
     it = iter(it)
     return iter(lambda: tuple(islice(it, size)), ())
 
 
+def numpy_to_pil(images):
+    """
+    Convert a single numpy image or a batch of numpy images into a list of PIL images (one per batch entry).
+    """
+    if images.ndim == 3:  # a 3-D array is treated as one image
+        images = images[None, ...]  # prepend a batch axis so the loop below is uniform
+    images = (images * 255).round().astype("uint8")  # presumably input is float in [0, 1] -- TODO confirm with caller
+    pil_images = [Image.fromarray(image) for image in images]  # iterates along the first (batch) axis
+
+    return pil_images
+
+
 def load_model_from_config(config, ckpt, verbose=False):
     print(f"Loading model from {ckpt}")
     pl_sd = torch.load(ckpt, map_location="cpu")
@@ -220,7 +237,9 @@ def main():
     if opt.fixed_code:
         start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
 
+    print("start code", start_code.abs().sum())
     precision_scope = autocast if opt.precision=="autocast" else nullcontext
+    precision_scope = nullcontext
     with torch.no_grad():
         with precision_scope("cuda"):
             with model.ema_scope():
@@ -269,7 +288,11 @@ def main():
                     Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                     grid_count += 1
 
-                toc = time.time()
+                image = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+
+                # run safety checker
+                safety_checker_input = pipe.feature_extractor(numpy_to_pil(image), return_tensors="pt")
+                image, has_nsfw_concept = pipe.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
 
     print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
           f" \nEnjoy.")

From 536eb1a8ba9e7e3806727cc25331d43b1f076f07 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 19 Aug 2022 18:03:22 +0200
Subject: [PATCH 2/6] Apply suggestions from code review

---
 scripts/txt2img.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index 0af430c..e93cb19 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -237,9 +237,7 @@ def main():
     if opt.fixed_code:
         start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
 
-    print("start code", start_code.abs().sum())
     precision_scope = autocast if opt.precision=="autocast" else nullcontext
-    precision_scope = nullcontext
     with torch.no_grad():
         with precision_scope("cuda"):
             with model.ema_scope():
@@ -288,6 +286,7 @@ def main():
                     Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                     grid_count += 1
 
+                toc = time.time()
                 image = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
 
                 # run safety checker

From eef5da90dbee4b22bdd864e53726993f98ae3366 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 19 Aug 2022 17:05:39 +0000
Subject: [PATCH 3/6] finish: run safety checker inside the sampling loop and save the checked images

---
 scripts/txt2img.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index 0af430c..7b15fbd 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -19,8 +19,10 @@ from ldm.models.diffusion.plms import PLMSSampler
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from transformers import AutoFeatureExtractor
 
-feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v-1-3", use_auth_token=True)
-safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-v-1-3", use_auth_token=True)
+# load safety model
+safety_model_id = "CompVis/stable-diffusion-v-1-3"
+safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id, use_auth_token=True)
+safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id, use_auth_token=True)
 
 def chunk(it, size):
     it = iter(it)
@@ -266,16 +268,23 @@ def main():
 
                         x_samples_ddim = model.decode_first_stage(samples_ddim)
                         x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+
+                        x_image = x_samples_ddim
+                        safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
+                        x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
+
+                        x_checked_image_torch = torch.from_numpy(x_checked_image).permute(0, 3, 2, 1)
 
                         if not opt.skip_save:
-                            for x_sample in x_samples_ddim:
+                            for x_sample in x_checked_image_torch:
                                 x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
                                 Image.fromarray(x_sample.astype(np.uint8)).save(
                                     os.path.join(sample_path, f"{base_count:05}.png"))
                                 base_count += 1
 
                         if not opt.skip_grid:
-                            all_samples.append(x_samples_ddim)
+                            all_samples.append(x_checked_image_torch)
 
                 if not opt.skip_grid:
                     # additionally, save as grid
@@ -288,12 +297,6 @@ def main():
                     Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                     grid_count += 1
 
-                image = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
-
-                # run safety checker
-                safety_checker_input = pipe.feature_extractor(numpy_to_pil(image), return_tensors="pt")
-                image, has_nsfw_concept = pipe.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
-
     print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
           f" \nEnjoy.")
 

From 239ed0fd0247807df4f62e5dc22dfe00f6e14db3 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 19 Aug 2022 17:09:41 +0000
Subject: [PATCH 4/6] fix more: drop .numpy() on decoded samples and remove leftover pipe-based safety checker call

---
 scripts/txt2img.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index add30f5..9613689 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -266,7 +266,7 @@ def main():
 
                         x_samples_ddim = model.decode_first_stage(samples_ddim)
                         x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1)
 
                         x_image = x_samples_ddim
                         safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
@@ -295,11 +295,6 @@ def main():
                     Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                     grid_count += 1
 
-                image = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
-
-                # run safety checker
-                safety_checker_input = pipe.feature_extractor(numpy_to_pil(image), return_tensors="pt")
-                image, has_nsfw_concept = pipe.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
                 toc = time.time()
 
     print(f"Your samples are ready and waiting for you here: \n{outpath} \n"

From b9851783e5400900a5de8ad43cb6f407f2cd1e4d Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 19 Aug 2022 17:14:29 +0000
Subject: [PATCH 5/6] fix to numpy: restore .numpy() conversion of decoded samples

---
 scripts/txt2img.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index 9613689..1b8888f 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -266,7 +266,7 @@ def main():
 
                         x_samples_ddim = model.decode_first_stage(samples_ddim)
                         x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
-                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1)
+                        x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
 
                         x_image = x_samples_ddim
                         safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")

From 673b0ab3a303fa539f950db00525f2e2bb0ce60e Mon Sep 17 00:00:00 2001
From: Patrick Esser <patrick.esser@gmx.net>
Date: Mon, 22 Aug 2022 10:58:42 +0000
Subject: [PATCH 6/6] update safety model id

---
 scripts/txt2img.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index 1b8888f..bf8b4a0 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -20,9 +20,9 @@ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionS
 from transformers import AutoFeatureExtractor
 
 # load safety model
-safety_model_id = "CompVis/stable-diffusion-v-1-3"
-safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id, use_auth_token=True)
-safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id, use_auth_token=True)
+safety_model_id = "CompVis/stable-diffusion-safety-checker"
+safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
+safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
 
 def chunk(it, size):
     it = iter(it)
@@ -272,7 +272,7 @@ def main():
                         safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
                         x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
 
-                        x_checked_image_torch = torch.from_numpy(x_checked_image).permute(0, 3, 2, 1)
+                        x_checked_image_torch = torch.from_numpy(x_checked_image).permute(0, 3, 1, 2)
 
                         if not opt.skip_save:
                             for x_sample in x_checked_image_torch: