commit cd1e2756bcb9c8b2a35402e388639e880170e06f
Author: uvos
Date:   Fri Jun 14 08:54:09 2024 +0200

    initial commit

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7cd1ebf
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 3.6)
+project(ImageAiUtils)
+
+set(CMAKE_CXX_STANDARD 17)
+set(WEIGHT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/Weights)
+
+add_subdirectory(SmartCrop)
diff --git a/DanbooruTagger/DanbooruTagger.py b/DanbooruTagger/DanbooruTagger.py
new file mode 100644
index 0000000..0ebeee1
--- /dev/null
+++ b/DanbooruTagger/DanbooruTagger.py
@@ -0,0 +1,104 @@
+import warnings
+from deepdanbooru_onnx import DeepDanbooru
+from PIL import Image
+import argparse
+import cv2
+import os
+from multiprocessing import Process, Queue
+import json
+from tqdm import tqdm
+
+
+image_ext_ocv = [".bmp", ".jpeg", ".jpg", ".png"]
+
+
+def find_image_files(path: str) -> list[str]:
+    paths = list()
+    for root, dirs, files in os.walk(path):
+        for filename in files:
+            name, extension = os.path.splitext(filename)
+            if extension.lower() in image_ext_ocv:
+                paths.append(os.path.join(root, filename))
+    return paths
+
+
+def image_loader(paths: list[str]):
+    for path in paths:
+        name, extension = os.path.splitext(path)
+        extension = extension.lower()
+        imagebgr = cv2.imread(path)
+        if imagebgr is None:
+            print(f"Warning: could not load {path}")
+        else:
+            image = cv2.cvtColor(imagebgr, cv2.COLOR_BGR2RGB)
+            image_pil = Image.fromarray(image)
+            yield image_pil, path
+
+
+def pipeline(queue: Queue, image_paths: list[str], device: int):
+    danbooru = DeepDanbooru()
+
+    for path in image_paths:
+        # DeepDanbooru returns a dict of tag -> score; join the tag names
+        tags = danbooru(path)
+        imageprompt = ", ".join(tags)
+
+        queue.put({"file_name": path, "text": imageprompt})
+
+
+def split_list(input_list, count):
+    target_length = int(len(input_list) / count)
+    for i in range(0, count - 1):
+        yield input_list[i * target_length: (i + 1) * target_length]
+    yield input_list[(count - 1) * target_length: len(input_list)]
+
+
+def save_meta(meta_file, meta, reldir, common_description):
+    meta["file_name"] = os.path.relpath(meta["file_name"], reldir)
+    if common_description is not None:
+        meta["text"] = common_description + meta["text"]
+    meta_file.write(json.dumps(meta) + '\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("A script to tag images via DeepDanbooru")
+    parser.add_argument('--batch', '-b', default=4, type=int, help="Batch size to use for inference")
+    parser.add_argument('--common_description', '-c', help="An optional description that will be prepended to the ai generated one")
+    parser.add_argument('--image_dir', '-i', required=True, help="A directory containing the images to tag")
+    args = parser.parse_args()
+
+    nparallel = 2
+
+    image_paths = find_image_files(args.image_dir)
+    image_path_chunks = list(split_list(image_paths, nparallel))
+
+    print(f"Will use {nparallel} processes to create tags")
+
+    queue = Queue()
+    processes = list()
+    for i in range(0, nparallel):
+        processes.append(Process(target=pipeline, args=(queue, image_path_chunks[i], i)))
+        processes[-1].start()
+
+    progress = tqdm(desc="Generating tags", total=len(image_paths))
+    done = False
+    with open(os.path.join(args.image_dir, "metadata.jsonl"), mode='w') as output_file:
+        while not done:
+            if not queue.empty():
+                meta = queue.get()
+                save_meta(output_file, meta, args.image_dir, args.common_description)
+                progress.update()
+            done = True
+            for process in processes:
+                if process.is_alive():
+                    done = False
+                    break
+
+        while not queue.empty():
+            meta = queue.get()
+            save_meta(output_file, meta, args.image_dir, args.common_description)
+            progress.update()
+
+    for process in processes:
+        process.join()
+
diff --git a/DanbooruTagger/deepdanbooru_onnx/__init__.py b/DanbooruTagger/deepdanbooru_onnx/__init__.py
new file mode 100644
index 0000000..21d7e94
--- /dev/null
+++ b/DanbooruTagger/deepdanbooru_onnx/__init__.py
@@ -0,0 +1,3 @@
+from .deepdanbooru_onnx import DeepDanbooru
+from .deepdanbooru_onnx import process_image
+__version__ = '0.0.8'
\ No newline at end of file
diff --git a/DanbooruTagger/deepdanbooru_onnx/__pycache__/__init__.cpython-312.pyc b/DanbooruTagger/deepdanbooru_onnx/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..6edaed3
Binary files /dev/null and b/DanbooruTagger/deepdanbooru_onnx/__pycache__/__init__.cpython-312.pyc differ
diff --git a/DanbooruTagger/deepdanbooru_onnx/__pycache__/deepdanbooru_onnx.cpython-312.pyc b/DanbooruTagger/deepdanbooru_onnx/__pycache__/deepdanbooru_onnx.cpython-312.pyc
new file mode 100644
index 0000000..e5960a9
Binary files /dev/null and b/DanbooruTagger/deepdanbooru_onnx/__pycache__/deepdanbooru_onnx.cpython-312.pyc differ
diff --git a/DanbooruTagger/deepdanbooru_onnx/deepdanbooru_onnx.py b/DanbooruTagger/deepdanbooru_onnx/deepdanbooru_onnx.py
new file mode 100644
index 0000000..c108ceb
--- /dev/null
+++ b/DanbooruTagger/deepdanbooru_onnx/deepdanbooru_onnx.py
@@ -0,0 +1,244 @@
+import onnxruntime as ort
+from PIL import Image
+import numpy as np
+import os
+from tqdm import tqdm
+import requests
+import hashlib
+from typing import List, Union
+import shutil
+from pathlib import Path
+
+
+def process_image(image: Image.Image) -> np.ndarray:
+    """
+    Convert an image to a numpy array.
+    :param image: the image to convert
+    :return: the numpy array
+    """
+
+    image = image.convert("RGB").resize((512, 512))
+    image = np.array(image).astype(np.float32) / 255
+    image = image.transpose((2, 0, 1)).reshape(1, 3, 512, 512).transpose((0, 2, 3, 1))
+    return image
+
+
+def download(url: str, save_path: str, md5: str, length: int) -> bool:
+    """
+    Download a file from url to save_path.
+    If the file already exists, check its md5.
+    If the md5 matches, return True; if it doesn't, return False.
+    :param url: the url of the file to download
+    :param save_path: the path to save the file
+    :param md5: the md5 of the file
+    :param length: the length of the file in bytes
+    :return: True if the file is downloaded successfully, False otherwise
+    """
+
+    try:
+        response = requests.get(url=url, stream=True)
+        with open(save_path, "wb") as f:
+            with tqdm.wrapattr(
+                response.raw, "read", total=length, desc="Downloading"
+            ) as r_raw:
+                shutil.copyfileobj(r_raw, f)
+        return (
+            True
+            if hashlib.md5(open(save_path, "rb").read()).hexdigest() == md5
+            else False
+        )
+    except Exception as e:
+        print(e)
+        return False
+
+
+def download_model():
+    """
+    Download the model and tags file from the server.
+    :return: the path to the model and tags file
+    """
+
+    model_url = (
+        "https://huggingface.co/chinoll/deepdanbooru/resolve/main/deepdanbooru.onnx"
+    )
+    tags_url = "https://huggingface.co/chinoll/deepdanbooru/resolve/main/tags.txt"
+    model_md5 = "16be4e40ebcc0b1d1915bbf31f00969f"
+    tags_md5 = "a3f764de985cdeba89f1d232a4204402"
+    model_length = 643993025
+    tags_length = 133810
+
+    home = str(Path.home()) + "/.deepdanbooru_onnx/"
+    if not os.path.exists(home):
+        os.mkdir(home)
+
+    model_name = "deepdanbooru.onnx"
+    tags_name = "tags.txt"
+
+    model_path = home + model_name
+    tags_path = home + tags_name
+    if os.path.exists(model_path):
+        if hashlib.md5(open(model_path, "rb").read()).hexdigest() != model_md5:
+            os.remove(model_path)
+            if not download(model_url, model_path, model_md5, model_length):
+                raise ValueError("Model download failed")
+
+    else:
+        if not download(model_url, model_path, model_md5, model_length):
+            raise ValueError("Model download failed")
+
+    if os.path.exists(tags_path):
+        if hashlib.md5(open(tags_path, "rb").read()).hexdigest() != tags_md5:
+            os.remove(tags_path)
+            if not download(tags_url, tags_path, tags_md5, tags_length):
+                raise ValueError("Tags download failed")
+    else:
+        if not download(tags_url, tags_path, tags_md5, tags_length):
+            raise ValueError("Tags download failed")
+    return model_path, tags_path
+
+
+class DeepDanbooru:
+    def __init__(
+        self,
+        mode: str = "auto",
+        model_path: Union[str, None] = None,
+        tags_path: Union[str, None] = None,
+        threshold: Union[float, int] = 0.6,
+        pin_memory: bool = False,
+        batch_size: int = 1,
+    ):
+        """
+        Initialize the DeepDanbooru class.
+        :param mode: the mode of the model, "cpu", "gpu", "tensorrt" or "auto"
+        :param model_path: the path to the model file
+        :param tags_path: the path to the tags file
+        :param threshold: the threshold of the model
+        :param pin_memory: whether to use pin memory
+        :param batch_size: the batch size of the model
+        """
+
+        providers = {
+            "cpu": "CPUExecutionProvider",
+            "gpu": "CUDAExecutionProvider",
+            "tensorrt": "TensorrtExecutionProvider",
+            "auto": (
+                "CUDAExecutionProvider"
+                if "CUDAExecutionProvider" in ort.get_available_providers()
+                else "CPUExecutionProvider"
+            ),
+        }
+
+        if not (isinstance(threshold, float) or isinstance(threshold, int)):
+            raise TypeError("threshold must be float or int")
+        if threshold < 0 or threshold > 1:
+            raise ValueError("threshold must be between 0 and 1")
+        if mode not in providers:
+            raise ValueError(
+                "Mode not supported. Please choose from: cpu, gpu, tensorrt, auto"
+            )
+        if providers[mode] not in ort.get_available_providers():
+            raise ValueError(
+                f"Your device does not support {mode}. Please choose from: cpu"
+            )
+        if model_path is not None and not os.path.exists(model_path):
+            raise FileNotFoundError("Model file not found")
+        if tags_path is not None and not os.path.exists(tags_path):
+            raise FileNotFoundError("Tags file not found")
+
+        if model_path is None or tags_path is None:
+            model_path, tags_path = download_model()
+
+        self.session = ort.InferenceSession(model_path, providers=[providers[mode]])
+        self.tags = [i.replace("\n", "") for i in open(tags_path, "r").readlines()]
+
+        self.input_name = self.session.get_inputs()[0].name
+        self.output_name = [output.name for output in self.session.get_outputs()]
+        self.threshold = threshold
+        self.pin_memory = pin_memory
+        self.batch_size = batch_size
+        self.mode = mode
+        self.cache = {}
+
+    def __str__(self) -> str:
+        return f"DeepDanbooru(mode={self.mode}, threshold={self.threshold}, pin_memory={self.pin_memory}, batch_size={self.batch_size})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    def from_image_inference(self, image: Image.Image) -> dict:
+        image = process_image(image)
+        return self.predict(image)
+
+    def from_ndarray_inference(self, image: np.ndarray) -> dict:
+        if image.shape != (1, 512, 512, 3):
+            raise ValueError(f"Image must be {(1, 512, 512, 3)}")
+        return self.predict(image)
+
+    def from_file_inference(self, image: str) -> dict:
+        return self.from_image_inference(Image.open(image))
+
+    def from_list_inference(self, image: Union[list, tuple]) -> List[dict]:
+        if self.pin_memory:
+            image = [process_image(Image.open(i)) for i in image]
+        for i in [
+            image[i : i + self.batch_size]
+            for i in range(0, len(image), self.batch_size)
+        ]:
+            imagelist = i
+            bs = len(i)
+            _imagelist, idx, hashlist = [], [], []
+            for j in range(len(i)):
+                img = Image.open(i[j]) if not self.pin_memory else imagelist[j]
+                image_hash = hashlib.md5(np.array(img).astype(np.uint8)).hexdigest()
+                hashlist.append(image_hash)
+                if image_hash in self.cache:
+                    continue
+                if not self.pin_memory:
+                    _imagelist.append(process_image(img))
+                else:
+                    _imagelist.append(imagelist[j])
+                idx.append(j)
+
+            imagelist = _imagelist
+            if len(imagelist) != 0:
+                _image = np.vstack(imagelist)
+                results = self.inference(_image)
+                results_idx = 0
+            else:
+                results = []
+
+            for i in range(bs):
+                image_tag = {}
+                if i in idx:
+                    hash = hashlist[i]
+                    for tag, score in zip(self.tags, results[results_idx]):
+                        if score >= self.threshold:
+                            image_tag[tag] = score
+                    results_idx += 1
+                    self.cache[hash] = image_tag
+                    yield image_tag
+                else:
+                    yield self.cache[hashlist[i]]
+
+    def inference(self, image):
+        return self.session.run(self.output_name, {self.input_name: image})[0]
+
+    def predict(self, image):
+        result = self.inference(image)
+        image_tag = {}
+        for tag, score in zip(self.tags, result[0]):
+            if score >= self.threshold:
+                image_tag[tag] = score
+        return image_tag
+
+    def __call__(self, image) -> Union[dict, List[dict]]:
+        if isinstance(image, str):
+            return self.from_file_inference(image)
+        elif isinstance(image, np.ndarray):
+            return self.from_ndarray_inference(image)
+        elif isinstance(image, list) or isinstance(image, tuple):
+            return self.from_list_inference(image)
+        elif isinstance(image, Image.Image):
+            return self.from_image_inference(image)
+        else:
+            raise ValueError("Image must be a file path or a numpy array or list/tuple")
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f288702
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation,
Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. 
+ + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. 
You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. 
+ + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. 
+ + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. 
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/LLavaTagger/LLavaTagger.py b/LLavaTagger/LLavaTagger.py
new file mode 100644
index 0000000..355121b
--- /dev/null
+++ b/LLavaTagger/LLavaTagger.py
@@ -0,0 +1,142 @@
+import warnings
+warnings.simplefilter(action='ignore')
+from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig, logging
+import argparse
+import cv2
+import torch
+import os
+import numpy
+from typing import Iterator
+from torch.multiprocessing import Process, Queue
+import json
+from tqdm import tqdm
+
+
+image_ext_ocv = [".bmp", ".jpeg", ".jpg", ".png"]
+
+
+def find_image_files(path: str) -> list[str]:
+    paths = list()
+    for root, dirs, files in os.walk(path):
+        for filename in files:
+            name, extension = os.path.splitext(filename)
+            if extension.lower() in image_ext_ocv:
+                paths.append(os.path.join(root, filename))
+    return paths
+
+
+def image_loader(paths: list[str]) -> Iterator[tuple[numpy.ndarray, str]]:
+    for path in paths:
+        name, extension = os.path.splitext(path)
+        extension = extension.lower()
+        imagebgr = cv2.imread(path)
+        if imagebgr is None:
+            print(f"Warning: could not load {path}")
+        else:
+            image = cv2.cvtColor(imagebgr, cv2.COLOR_BGR2RGB)
+            yield image, path
+
+
+def pipeline(queue: Queue, image_paths: list[str], prompt: str, device: torch.device, model_name_or_path: str, batch_size: int, quantize: bool):
+    model = LlavaForConditionalGeneration.from_pretrained(model_name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=None,
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=False,
+            bnb_4bit_quant_type='nf4',
+        ) if quantize else None, device_map=device, attn_implementation="flash_attention_2")
+    processor = AutoProcessor.from_pretrained(model_name_or_path)
+    image_generator = image_loader(image_paths)
+
+    stop = False
+    finished_count = 0
+    while not stop:
+        prompts = list()
+        images = list()
+        filenames = list()
+        for i in range(0, batch_size):
+            image, filename = next(image_generator, (None, None))
+            if image is None:
+                stop = True
+                break
+
+            filenames.append(filename)
+            images.append(image)
+            prompts.append(prompt)
+
+        if len(images) == 0:
+            break
+
+        inputs = processor(text=prompts, images=images, return_tensors="pt").to(model.device)
+        generate_ids = model.generate(**inputs, max_new_tokens=100, min_new_tokens=3, length_penalty=1.0, do_sample=False, temperature=1.0, top_k=50, top_p=1.0)
+        decodes = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        finished_count += len(images)
+        for i, decoded in enumerate(decodes):
+            trim = len(prompt) - len("<image>")  # length of the prompt as it appears in the decoded output
+            queue.put({"file_name": filenames[i], "text": decoded[trim:].strip()})
+
+
+def split_list(input_list, count):
+    target_length = int(len(input_list) / count)
+    for i in range(0, count - 1):
+        yield input_list[i * target_length: (i + 1) * target_length]
+    yield input_list[(count - 1) * target_length: len(input_list)]
+
+
+def save_meta(meta_file, meta, reldir, common_description):
+    meta["file_name"] = os.path.relpath(meta["file_name"], reldir)
+    if common_description is not None:
+        meta["text"] = common_description + meta["text"]
+    meta_file.write(json.dumps(meta) + '\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("A script to tag images via llava")
+    parser.add_argument('--model', '-m', default="llava-hf/llava-1.5-13b-hf", help="model to use")
+    parser.add_argument('--quantize', '-q', action='store_true', help="load the model quantized to 4 bit")
+    parser.add_argument('--prompt', '-p', default="Please describe this image in 10 to 20 words.", help="Prompt to use on each image")
+    parser.add_argument('--batch', '-b', default=4, type=int, help="Batch size to use for inference")
+    parser.add_argument('--common_description', '-c', help="An optional description that will be prepended to the ai generated one")
+    parser.add_argument('--image_dir', '-i', required=True, help="A directory containing the images to tag")
+    args = parser.parse_args()
+
+    prompt = "USER: <image>\n" + args.prompt + "\nASSISTANT: "
+    os.environ["BITSANDBYTES_NOWELCOME"] = "1"
+
+    image_paths = find_image_files(args.image_dir)
+    image_path_chunks = list(split_list(image_paths, torch.cuda.device_count()))
+
+    print(f"Will use {torch.cuda.device_count()} processes to create tags")
+
+    logging.set_verbosity_error()
+    warnings.filterwarnings("ignore")
+    torch.multiprocessing.set_start_method('spawn')
+
+    queue = Queue()
+    processes = list()
+    for i in range(0, torch.cuda.device_count()):
+        processes.append(Process(target=pipeline, args=(queue, image_path_chunks[i], prompt, torch.device(i), args.model, args.batch, args.quantize)))
+        processes[-1].start()
+
+    progress = tqdm(desc="Generating tags", total=len(image_paths))
+    done = False
+    with open(os.path.join(args.image_dir, "metadata.jsonl"), mode='w') as output_file:
+        while not done:
+            if not queue.empty():
+                meta = queue.get()
+                save_meta(output_file, meta, args.image_dir, args.common_description)
+                progress.update()
+            done = True
+            for process in processes:
+                if process.is_alive():
+                    done = False
+                    break
+
+        while not queue.empty():
+            meta = queue.get()
+            save_meta(output_file, meta, args.image_dir, args.common_description)
+            progress.update()
+
+    for process in processes:
+        process.join()
+
diff --git a/LLavaTagger/README.md b/LLavaTagger/README.md
new file mode 100644
index 0000000..c9381e4
--- /dev/null
+++ b/LLavaTagger/README.md
@@ -0,0 +1,25 @@
+# LLavaTagger
+
+LLavaTagger is a python script that tags images based on a given prompt using the [LLaVA](https://llava-vl.github.io/) multimodal LLM. LLavaTagger supports using any number of GPUs in DDP parallel for this task.
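+
+Each line of the created metadata.jsonl holds one JSON object with the image path relative to --image_dir and the generated text, with the optional --common_description prepended. For a hypothetical cat1.jpg tagged with the example invocation below, a line might look like:
+
+    {"file_name": "cat1.jpg", "text": "an image of a cat, a tabby cat sitting on a windowsill in the sun"}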
+
+## How to use
+
+First create a python venv and install the required packages into it:
+
+    $ python -m venv venv
+    $ source venv/bin/activate
+    $ pip install -r requirements.txt
+
+Then run LLavaTagger for instance like so:
+
+    $ python LLavaTagger.py --common_description "an image of a cat, " --prompt "describe the cat in 10 to 20 words" --batch 8 --quantize --image_dir ~/cat_images
+
+By default LLavaTagger will run in parallel on all available GPUs. If this is undesirable, use the ROCR_VISIBLE_DEVICES= or CUDA_VISIBLE_DEVICES= environment variables to hide unwanted GPUs.
+
+LLavaTagger will then create a metadata.jsonl in the image directory suitable to be used by the scripts of [diffusers](https://github.com/huggingface/diffusers) to train stable diffusion (xl). If other formats are desired, ../utils contains scripts to transform the metadata into other formats, for instance for use with [kohya](https://github.com/bmaltais/kohya_ss).
+
+If editing the created tags is desired, [QImageTagger](https://uvos.xyz/git/uvos/QImageTagger) can be used for this purpose.
diff --git a/LLavaTagger/requirements.txt b/LLavaTagger/requirements.txt
new file mode 100644
index 0000000..c4e265c
--- /dev/null
+++ b/LLavaTagger/requirements.txt
@@ -0,0 +1,11 @@
+accelerate==0.29.0
+bitsandbytes
+huggingface-hub==0.22.2
+ninja==1.11.1.1
+safetensors==0.4.2
+tokenizers==0.15.2
+transformers
+torch
+opencv-python
+numpy
+tqdm
diff --git a/PersonDatasetAssembler/PersonDatasetAssembler.py b/PersonDatasetAssembler/PersonDatasetAssembler.py
new file mode 100755
index 0000000..8b365a5
--- /dev/null
+++ b/PersonDatasetAssembler/PersonDatasetAssembler.py
@@ -0,0 +1,176 @@
+#!/bin/python3
+
+# PersonDatasetAssembler - A tool to assemble images of a specific person from a
+# directory of images or from a video file
+# Copyright (C) 2024 Carl Philipp Klemm
+#
+# This file is part of PersonDatasetAssembler.
+#
+# PersonDatasetAssembler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PersonDatasetAssembler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PersonDatasetAssembler. If not, see <http://www.gnu.org/licenses/>.
+
+import argparse
+import os
+from typing import Iterator
+import cv2
+import numpy
+from tqdm import tqdm
+from wand.exceptions import BlobError
+from wand.image import Image
+
+image_ext_ocv = [".bmp", ".jpeg", ".jpg", ".png"]
+image_ext_wand = [".dng", ".arw"]
+
+
+class LoadException(Exception):
+    pass
+
+
+def find_image_files(path: str) -> list[str]:
+    paths = list()
+    for root, dirs, files in os.walk(path):
+        for filename in files:
+            name, extension = os.path.splitext(filename)
+            if extension.lower() in image_ext_ocv or extension.lower() in image_ext_wand:
+                paths.append(os.path.join(root, filename))
+    return paths
+
+
+def image_loader(paths: list[str]) -> Iterator[numpy.ndarray]:
+    for path in paths:
+        name, extension = os.path.splitext(path)
+        extension = extension.lower()
+        if extension in image_ext_ocv:
+            image = cv2.imread(path)
+            if image is None:
+                print(f"Warning: could not load {path}")
+            else:
+                yield image
+        elif extension in image_ext_wand:
+            try:
+                image = Image(filename=path)
+            except BlobError as e:
+                print(f"Warning: could not load {path}, {e}")
+                continue
+            # Convert the wand image to a BGR array so it matches cv2.imread()
+            # output (conversion assumed: wand decodes to an RGB(A) buffer that
+            # it exposes via the numpy array interface).
+            rgb = numpy.array(image)
+            code = cv2.COLOR_RGBA2BGR if rgb.shape[2] == 4 else cv2.COLOR_RGB2BGR
+            yield cv2.cvtColor(rgb, code)
+
+
+def extract_video_images(video: cv2.VideoCapture, interval: int = 0):
+    ret = True
+    frame_counter = 0
+    while ret:
+        video.set(cv2.CAP_PROP_POS_FRAMES, frame_counter)
+        ret, frame = video.read()
+        if ret:
+            yield frame
+        frame_counter += interval
+
+
+def contains_face_match(detector: cv2.FaceDetectorYN, recognizer: cv2.FaceRecognizerSF, image: numpy.ndarray, referance_features: list, thresh: float) -> tuple[float, bool]:
+    detector.setInputSize([image.shape[1], image.shape[0]])
+    faces = detector.detect(image)[1]
+    if faces is None:
+        return 0, False
+    for face in faces:
+        cropped_image = recognizer.alignCrop(image, face)
+        features = recognizer.feature(cropped_image)
+        # average the cosine similarity of this face against all reference features
+        score_accum = 0.0
+        for referance in referance_features:
+            score_accum += recognizer.match(referance, features, 0)
+        score = score_accum / len(referance_features)
+        if score > thresh:
+            return score, True
+    return 0, False
+
+
+def process_referance(detector: cv2.FaceDetectorYN, recognizer: cv2.FaceRecognizerSF, referance_path: str) -> list:
+    images = list()
+    out = list()
+
+    if os.path.isfile(referance_path):
+        image = cv2.imread(referance_path)
+        if image is None:
+            print(f"Could not load image from {referance_path}")
+        else:
+            images.append(image)
+    elif os.path.isdir(referance_path):
+        filenames = find_image_files(referance_path)
+        images = list(image_loader(filenames))
+
+    for image in images:
+        detector.setInputSize([image.shape[1], image.shape[0]])
+        faces = detector.detect(image)[1]
+        if faces is None:
+            print("unable to find face in reference image")
+            exit(1)
+        image = recognizer.alignCrop(image, faces[0])
+        features = recognizer.feature(image)
+        out.append(features)
+
+    return out
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Script to assemble a dataset of images of a specific person")
+    parser.add_argument('--out', '-o', default="out", help="place to put the dataset")
+    parser.add_argument('--input', '-i', required=True, help="directory or video file to get images from")
+    parser.add_argument('--skip', '-s', default=0, type=int, help="skip n frames between samples when grabbing from a video file")
+    parser.add_argument('--referance', '-r', required=True, help="reference image or directory of images of the person to be found")
+    parser.add_argument('--match_model', '-m', required=True, help="Path to the onnx recognition model to be used")
+    parser.add_argument('--detect_model', '-d', required=True, help="Path to the onnx detection model to be used")
+    parser.add_argument('--threshold', '-t', default=0.362, type=float, help="match threshold to use")
+    parser.add_argument('--invert', '-n', action='store_true', help="output files that DON'T match")
+    args = parser.parse_args()
+
+    recognizer = cv2.FaceRecognizerSF.create(model=args.match_model, config="", backend_id=cv2.dnn.DNN_BACKEND_DEFAULT, target_id=cv2.dnn.DNN_TARGET_CPU)
+    detector = cv2.FaceDetectorYN.create(model=args.detect_model, config="", input_size=[320, 320],
+        score_threshold=0.6, nms_threshold=0.3, top_k=5000, backend_id=cv2.dnn.DNN_BACKEND_DEFAULT, target_id=cv2.dnn.DNN_TARGET_CPU)
+
+    referance_features = process_referance(detector, recognizer, args.referance)
+    if len(referance_features) < 1:
+        print(f"Could not load any reference image(s) from {args.referance}")
+        exit(1)
+
+    if os.path.isfile(args.input):
+        video = cv2.VideoCapture(args.input)
+        if not video.isOpened():
+            print(f"Unable to open {args.input} as a video file")
+            exit(1)
+        image_generator = extract_video_images(video, args.skip + 1)
+        total_images = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) / (args.skip + 1)
+    elif os.path.isdir(args.input):
+        image_filenames = find_image_files(args.input)
+        image_generator = image_loader(image_filenames)
+        total_images = len(image_filenames)
+    else:
+        print(f"{args.input} is neither a video file nor a directory")
+        exit(1)
+
+    os.makedirs(args.out, exist_ok=True)
+
+    progress = tqdm(total=int(total_images), desc="0.00")
+    counter = 0
+    for image in image_generator:
+        if image.shape[0] > 512:
+            aspect = image.shape[0] / image.shape[1]
+            resized = cv2.resize(image, (int(512 / aspect), 512), interpolation=cv2.INTER_AREA)
+        else:
+            resized = image
+        score, match = contains_face_match(detector, recognizer, resized, referance_features, args.threshold)
+        if match != args.invert:  # keep matches, or non-matches when --invert is given
+            filename = f"{counter:04}.png"
+            cv2.imwrite(os.path.join(args.out, filename), image)
+            counter += 1
+        progress.set_description(f"{score:1.2f}")
+        progress.update()
+
diff --git a/PersonDatasetAssembler/README.md b/PersonDatasetAssembler/README.md
new file mode 100644
index 0000000..e6ec6b8
--- /dev/null
+++ b/PersonDatasetAssembler/README.md
@@ -0,0 +1,20 @@
+### PersonDatasetAssembler
+
+PersonDatasetAssembler is a python script that finds images of a specific person, specified by a reference image, in a directory of images or in a video file. It also supports raw images.
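+
+Under the hood the script uses OpenCV's YuNet face detector and SFace recognizer. A minimal sketch of the matching step performed per candidate image (simplified, not the script's exact code; `reference_features` stands in for the features extracted from the --referance image):
+
+    import cv2
+
+    detector = cv2.FaceDetectorYN.create(model="face_detection_yunet_2023mar.onnx", config="", input_size=[320, 320])
+    recognizer = cv2.FaceRecognizerSF.create(model="face_recognition_sface_2021dec.onnx", config="")
+
+    image = cv2.imread("candidate.jpg")
+    detector.setInputSize([image.shape[1], image.shape[0]])
+    faces = detector.detect(image)[1]  # None when no face is found
+
+    # reference_features: output of recognizer.feature() on the reference image
+    aligned = recognizer.alignCrop(image, faces[0])
+    features = recognizer.feature(aligned)
+    score = recognizer.match(reference_features, features, cv2.FaceRecognizerSF_FR_COSINE)
+    matched = score > 0.362  # the script's default --threshold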
+
+## How to use
+
+First create a python venv and install the required packages into it:
+
+    $ python -m venv venv
+    $ source venv/bin/activate
+    $ pip install -r requirements.txt
+
+Then run PersonDatasetAssembler, for instance like so:
+
+    $ python PersonDatasetAssembler.py --referance someperson.jpg --match_model ../Weights/face_recognition_sface_2021dec.onnx --detect_model ../Weights/face_detection_yunet_2023mar.onnx --input ~/Photos --out imagesOfSomePerson
+
+Or to extract images from a video:
+
+    $ python PersonDatasetAssembler.py --referance someperson.jpg --match_model ../Weights/face_recognition_sface_2021dec.onnx --detect_model ../Weights/face_detection_yunet_2023mar.onnx -i ~/SomeVideo.mkv --out imagesOfSomePerson
+
diff --git a/PersonDatasetAssembler/requirements.txt b/PersonDatasetAssembler/requirements.txt
new file mode 100644
index 0000000..7e67c26
--- /dev/null
+++ b/PersonDatasetAssembler/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.26.4
+opencv-python==4.10.0.82
+tqdm==4.66.4
+Wand==0.6.13
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9871c1e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# SDImagePreprocess
+
+This repo contains a collection of high performance tools intended to ease the creation of datasets for image generation AI training, like stable diffusion.
+
+## Included tools
+
+This repo contains the following tools:
+
+### SmartCrop
+
+SmartCrop is an application that performs content aware cropping using [seam carving](https://en.wikipedia.org/wiki/Seam_carving) and resizing to bring a directory of images into the desired size and aspect ratio for training. SmartCrop is configurable to prioritize specific items or specific persons in the images provided.
+
+#### Content detected in image:
+
+![Content found in image](SmartCrop/images/IMGP3692.jpg)
+
+#### Cropped image based on content:
+![Cropped image](SmartCrop/images/IMGP3692C.jpg)
+
+### PersonDatasetAssembler
+
+PersonDatasetAssembler is a python script that finds images of a specific person, specified by a reference image, in a directory of images or in a video file. It also supports raw images.
+
+### LLavaTagger
+
+LLavaTagger is a python script that tags images based on a given prompt using the [LLaVA](https://llava-vl.github.io/) multimodal LLM. LLavaTagger supports using any number of gpus in DDP parallel for this task.
+
+### DanbooruTagger
+
+DanbooruTagger is a python script of dubious utility that tags images using the [DeepDanbooru](https://github.com/KichangKim/DeepDanbooru) convolutional network.
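+
+## Example workflow
+
+The tools can be combined; one plausible end-to-end pipeline, building a cropped and captioned dataset of a single person (paths and prompt illustrative, run from the repo root after building SmartCrop):
+
+    $ python PersonDatasetAssembler/PersonDatasetAssembler.py --referance someperson.jpg --match_model Weights/face_recognition_sface_2021dec.onnx --detect_model Weights/face_detection_yunet_2023mar.onnx -i ~/SomeVideo.mkv --out rawImages
+    $ smartcrop --out croppedImages --focus-person someperson.jpg rawImages/*
+    $ python LLavaTagger/LLavaTagger.py --prompt "describe the person in 10 to 20 words" --batch 8 --quantize --image_dir croppedImages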
+
+
+## License
+
+All files in this repo are licensed GPL v3, see LICENSE
diff --git a/SmartCrop/CMakeLists.txt b/SmartCrop/CMakeLists.txt
new file mode 100644
index 0000000..e3787df
--- /dev/null
+++ b/SmartCrop/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.6)
+
+find_package(OpenCV REQUIRED)
+
+set(CMAKE_CXX_STANDARD 17)
+
+set(SRC_FILES main.cpp yolo.cpp tokenize.cpp log.cpp seamcarving.cpp utils.cpp intelligentroi.cpp facerecognizer.cpp)
+
+add_executable(smartcrop ${SRC_FILES})
+target_link_libraries(smartcrop ${OpenCV_LIBS} -ltbb)
+target_include_directories(smartcrop PRIVATE ${OpenCV_INCLUDE_DIRS})
+target_compile_options(smartcrop PRIVATE -s -g -Wall)
+message(WARNING ${WEIGHT_DIR})
+target_compile_definitions(smartcrop PUBLIC WEIGHT_DIR="${WEIGHT_DIR}")
+
+install(TARGETS smartcrop RUNTIME DESTINATION bin)
diff --git a/SmartCrop/README.md b/SmartCrop/README.md
new file mode 100644
index 0000000..7a216fe
--- /dev/null
+++ b/SmartCrop/README.md
@@ -0,0 +1,50 @@
+# SmartCrop
+
+SmartCrop is an application that performs content aware cropping using [seam carving](https://en.wikipedia.org/wiki/Seam_carving) and resizing to bring a directory of images into the desired size and aspect ratio for training. SmartCrop is configurable to prioritize specific items or specific persons in the images provided.
+
+## Requirements
+
+* [cmake](https://cmake.org/) 3.6 or later
+* [opencv](https://opencv.org/) 4.8 or later
+* A C++17 capable compiler and standard lib like gcc or llvm/clang
+* git is required to get the source
+
+## Building
+
+The steps to build this application are:
+
+    $ git clone https://uvos.xyz/git/uvos/SDImagePreprocess.git
+    $ cd SDImagePreprocess
+    $ mkdir build
+    $ cd build
+    $ cmake ..
+    $ make
+
+The binary can then be found in build/SmartCrop and can optionally be installed with:
+
+    $ sudo make install
+
+## Basic usage
+
+To process all images in the directory ~/images and output the images into processedImages:
+
+    $ smartcrop --out processedImages ~/images/*
+
+To also focus on the person in the image ~/person.jpg:
+
+    $ smartcrop --out processedImages --focus-person ~/person.jpg ~/images/*
+
+To also enable seam carving:
+
+    $ smartcrop --out processedImages --focus-person ~/person.jpg --seam-carving ~/images/*
+
+See `smartcrop --help` for more options.
+
+## Example
+
+#### Content detected in image:
+![Content found in image](images/IMGP3692.jpg)
+
+#### Cropped image based on content:
+![Cropped image](images/IMGP3692C.jpg)
+
+
diff --git a/SmartCrop/facerecognizer.cpp b/SmartCrop/facerecognizer.cpp
new file mode 100644
index 0000000..623bfb2
--- /dev/null
+++ b/SmartCrop/facerecognizer.cpp
@@ -0,0 +1,163 @@
+//
+// SmartCrop - A tool for content aware cropping of images
+// Copyright (C) 2024 Carl Philipp Klemm
+//
+// This file is part of SmartCrop.
+//
+// SmartCrop is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// SmartCrop is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
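+//
+// Usage sketch (illustrative only, not part of the original source; see
+// facerecognizer.h for the full interface):
+//
+//   FaceRecognizer recognizer("", "", referenceImages); // empty paths select the built-in models
+//   FaceRecognizer::Detection match = recognizer.isMatch(image);
+//   if(match.person >= 0)
+//       Log(Log::INFO)<<"matched reference "<<match.person<<" with confidence "<<match.confidence;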
+//
+
+#include "facerecognizer.h"
+#include <fstream>
+
+#define INCBIN_PREFIX r
+#include "incbin.h"
+
+INCBIN(defaultRecognizer, WEIGHT_DIR "/face_recognition_sface_2021dec.onnx");
+INCBIN(defaultDetector, WEIGHT_DIR "/face_detection_yunet_2023mar.onnx");
+
+#include <cassert>
+#include <filesystem>
+#include <opencv2/core.hpp>
+#include <opencv2/objdetect.hpp>
+
+#include "log.h"
+
+static const std::vector<unsigned char> onnx((unsigned char*)rdefaultDetectorData, ((unsigned char*)rdefaultDetectorData)+rdefaultDetectorSize);
+
+FaceRecognizer::FaceRecognizer(std::filesystem::path recognizerPath, const std::filesystem::path& detectorPath, const std::vector<cv::Mat>& referances)
+{
+	if(detectorPath.empty())
+	{
+		Log(Log::INFO)<<"Using builtin face detection model";
+
+		detector = cv::FaceDetectorYN::create("onnx", onnx, std::vector<unsigned char>(), {320, 320}, 0.6, 0.3, 5000, cv::dnn::Backend::DNN_BACKEND_OPENCV, cv::dnn::Target::DNN_TARGET_CPU);
+		if(!detector)
+			throw LoadException("Unable to load detector network from built in file");
+	}
+	else
+	{
+		detector = cv::FaceDetectorYN::create(detectorPath, "", {320, 320}, 0.6, 0.3, 5000, cv::dnn::Backend::DNN_BACKEND_OPENCV, cv::dnn::Target::DNN_TARGET_CPU);
+		if(!detector)
+			throw LoadException("Unable to load detector network from "+detectorPath.string());
+	}
+
+	bool defaultNetwork = recognizerPath.empty();
+
+	if(defaultNetwork)
+	{
+		Log(Log::INFO)<<"Using builtin face recognition model";
+		recognizerPath = cv::tempfile("onnx");
+		std::ofstream file(recognizerPath);
+		if(!file.is_open())
+			throw LoadException("Unable to open temporary file at "+recognizerPath.string());
+		Log(Log::DEBUG)<<"Using "<<recognizerPath;
+		file.write(reinterpret_cast<const char*>(rdefaultRecognizerData), rdefaultRecognizerSize);
+		file.close();
+	}
+
+	recognizer = cv::FaceRecognizerSF::create(recognizerPath.string(), "", cv::dnn::Backend::DNN_BACKEND_OPENCV, cv::dnn::Target::DNN_TARGET_CPU);
+
+	if(defaultNetwork)
+		std::filesystem::remove(recognizerPath);
+
+	if(!recognizer)
+		throw LoadException("Unable to load recognizer network from "+recognizerPath.string());
+
+	addReferances(referances);
+}
+
+cv::Mat FaceRecognizer::detectFaces(const cv::Mat& input)
+{
+	detector->setInputSize(input.size());
+	cv::Mat faces;
+	detector->detect(input, faces);
+	return faces;
+}
+
+bool FaceRecognizer::addReferances(const std::vector<cv::Mat>& referances)
+{
+	bool ret = false;
+	for(const cv::Mat& image : referances)
+	{
+		cv::Mat faces = detectFaces(image);
+		assert(faces.cols == 15);
+		if(faces.empty())
+		{
+			Log(Log::WARN)<<"A reference image provided does not contain any face";
+			continue;
+		}
+		if(faces.rows > 1)
+			Log(Log::WARN)<<"A reference image provided contains more than one face, only the first detected face will be considered";
+		cv::Mat cropedImage;
+		recognizer->alignCrop(image, faces.row(0), cropedImage);
+		cv::Mat features;
+		recognizer->feature(cropedImage, features);
+		referanceFeatures.push_back(features.clone());
+		ret = true;
+	}
+
+	return ret;
+}
+
+void FaceRecognizer::setThreshold(double threasholdIn)
+{
+	threshold = threasholdIn;
+}
+
+double FaceRecognizer::getThreshold()
+{
+	return threshold;
+}
+
+void FaceRecognizer::clearReferances()
+{
+	referanceFeatures.clear();
+}
+
+FaceRecognizer::Detection FaceRecognizer::isMatch(const cv::Mat& input, bool alone)
+{
+	cv::Mat faces = detectFaces(input);
+
+	Detection bestMatch;
+	bestMatch.confidence = 0;
+	bestMatch.person = -1;
+
+	if(alone && faces.rows > 1)
+	{
+		bestMatch.person = -2;
+		return bestMatch;
+	}
+
+	for(int i = 0; i < faces.rows; ++i)
+	{
+		cv::Mat face;
+		recognizer->alignCrop(input, faces.row(i), face);
+		cv::Mat features;
+		recognizer->feature(face, features);
+		features = features.clone();
+		for(size_t referanceIndex = 0; referanceIndex < referanceFeatures.size(); ++referanceIndex)
+		{
+			double score = recognizer->match(referanceFeatures[referanceIndex], features, cv::FaceRecognizerSF::FR_COSINE);
+			if(score > threshold && score > bestMatch.confidence)
+			{
+				bestMatch.confidence = score;
+				bestMatch.person = referanceIndex;
+				bestMatch.rect = cv::Rect(faces.at<float>(i, 0), faces.at<float>(i, 1), faces.at<float>(i, 2), faces.at<float>(i, 3));
+			}
+		}
+	}
+
+	return bestMatch;
+}
diff --git a/SmartCrop/facerecognizer.h b/SmartCrop/facerecognizer.h
new file mode 100644
index 0000000..8b28613
--- /dev/null
+++ b/SmartCrop/facerecognizer.h
@@ -0,0 +1,67 @@
+/* * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <string>
+#include <vector>
+#include <memory>
+#include <exception>
+#include <filesystem>
+#include <opencv2/core.hpp>
+#include <opencv2/objdetect.hpp>
+
+class FaceRecognizer
+{
+public:
+
+	struct Detection
+	{
+		int person;
+		float confidence;
+		cv::Rect rect;
+	};
+
+	class LoadException : public std::exception
+	{
+	private:
+		std::string message;
+	public:
+		LoadException(const std::string& msg): std::exception(), message(msg) {}
+		virtual const char* what() const throw() override
+		{
+			return message.c_str();
+		}
+	};
+
+private:
+	std::vector<cv::Mat> referanceFeatures;
+	std::shared_ptr<cv::FaceRecognizerSF> recognizer;
+	std::shared_ptr<cv::FaceDetectorYN> detector;
+
+	double threshold = 0.363;
+
+public:
+	FaceRecognizer(std::filesystem::path recognizerPath = "", const std::filesystem::path& detectorPath = "", const std::vector<cv::Mat>& referances = std::vector<cv::Mat>());
+	cv::Mat detectFaces(const cv::Mat& input);
+	Detection isMatch(const cv::Mat& input, bool alone = false);
+	bool addReferances(const std::vector<cv::Mat>& referances);
+	void setThreshold(double threashold);
+	double getThreshold();
+	void clearReferances();
+};
diff --git a/SmartCrop/images/IMGP3692.jpg b/SmartCrop/images/IMGP3692.jpg
new file mode 100644
index 0000000..f22fbd9
Binary files /dev/null and b/SmartCrop/images/IMGP3692.jpg differ
diff --git a/SmartCrop/images/IMGP3692C.jpg b/SmartCrop/images/IMGP3692C.jpg
new file mode 100644
index 0000000..8f1f41e
Binary files /dev/null and b/SmartCrop/images/IMGP3692C.jpg differ
diff --git a/SmartCrop/incbin.h b/SmartCrop/incbin.h
new file mode 100644
index 0000000..502862f
--- /dev/null
+++ b/SmartCrop/incbin.h
@@ -0,0 +1,495 @@
+/* * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * SmartCrop is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with SmartCrop. If not, see . + */ + +/** + * @file incbin.h + * @author Dale Weiler + * @brief Utility for including binary files + * + * Facilities for including binary files into the current translation unit and + * making use from them externally in other translation units. + */ +#ifndef INCBIN_HDR +#define INCBIN_HDR +#include +#if defined(__AVX512BW__) || \ + defined(__AVX512CD__) || \ + defined(__AVX512DQ__) || \ + defined(__AVX512ER__) || \ + defined(__AVX512PF__) || \ + defined(__AVX512VL__) || \ + defined(__AVX512F__) +# define INCBIN_ALIGNMENT_INDEX 6 +#elif defined(__AVX__) || \ + defined(__AVX2__) +# define INCBIN_ALIGNMENT_INDEX 5 +#elif defined(__SSE__) || \ + defined(__SSE2__) || \ + defined(__SSE3__) || \ + defined(__SSSE3__) || \ + defined(__SSE4_1__) || \ + defined(__SSE4_2__) || \ + defined(__neon__) || \ + defined(__ARM_NEON) || \ + defined(__ALTIVEC__) +# define INCBIN_ALIGNMENT_INDEX 4 +#elif ULONG_MAX != 0xffffffffu +# define INCBIN_ALIGNMENT_INDEX 3 +# else +# define INCBIN_ALIGNMENT_INDEX 2 +#endif + +/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */ +#define INCBIN_ALIGN_SHIFT_0 1 +#define INCBIN_ALIGN_SHIFT_1 2 +#define INCBIN_ALIGN_SHIFT_2 4 +#define INCBIN_ALIGN_SHIFT_3 8 +#define INCBIN_ALIGN_SHIFT_4 16 +#define INCBIN_ALIGN_SHIFT_5 32 +#define INCBIN_ALIGN_SHIFT_6 64 + +/* Actual alignment value */ +#define INCBIN_ALIGNMENT \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \ + INCBIN_ALIGNMENT_INDEX) + +/* Stringize */ +#define INCBIN_STR(X) \ + #X +#define INCBIN_STRINGIZE(X) \ + INCBIN_STR(X) +/* Concatenate */ +#define INCBIN_CAT(X, Y) \ + X ## Y +#define INCBIN_CONCATENATE(X, Y) \ + INCBIN_CAT(X, Y) +/* Deferred macro expansion */ +#define INCBIN_EVAL(X) \ + X +#define INCBIN_INVOKE(N, ...) \ + INCBIN_EVAL(N(__VA_ARGS__)) +/* Variable argument count for overloading by arity */ +#define INCBIN_VA_ARG_COUNTER(_1, _2, _3, N, ...) N +#define INCBIN_VA_ARGC(...) INCBIN_VA_ARG_COUNTER(__VA_ARGS__, 3, 2, 1, 0) + +/* Green Hills uses a different directive for including binary data */ +#if defined(__ghs__) +# if (__ghs_asm == 2) +# define INCBIN_MACRO ".file" +/* Or consider the ".myrawdata" entry in the ld file */ +# else +# define INCBIN_MACRO "\tINCBIN" +# endif +#else +# define INCBIN_MACRO ".incbin" +#endif + +#ifndef _MSC_VER +# define INCBIN_ALIGN \ + __attribute__((aligned(INCBIN_ALIGNMENT))) +#else +# define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT)) +#endif + +#if defined(__arm__) || /* GNU C and RealView */ \ + defined(__arm) || /* Diab */ \ + defined(_ARM) /* ImageCraft */ +# define INCBIN_ARM +#endif + +#ifdef __GNUC__ +/* Utilize .balign where supported */ +# define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n" +# define INCBIN_ALIGN_BYTE ".balign 1\n" +#elif defined(INCBIN_ARM) +/* + * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is + * the shift count. 
This is the value passed to `.align' + */ +# define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n" +# define INCBIN_ALIGN_BYTE ".align 0\n" +#else +/* We assume other inline assembler's treat `.align' as `.balign' */ +# define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n" +# define INCBIN_ALIGN_BYTE ".align 1\n" +#endif + +/* INCBIN_CONST is used by incbin.c generated files */ +#if defined(__cplusplus) +# define INCBIN_EXTERNAL extern "C" +# define INCBIN_CONST extern const +#else +# define INCBIN_EXTERNAL extern +# define INCBIN_CONST const +#endif + +/** + * @brief Optionally override the linker section into which size and data is + * emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. + */ +#if !defined(INCBIN_OUTPUT_SECTION) +# if defined(__APPLE__) +# define INCBIN_OUTPUT_SECTION ".const_data" +# else +# define INCBIN_OUTPUT_SECTION ".rodata" +# endif +#endif + +/** + * @brief Optionally override the linker section into which data is emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. + */ +#if !defined(INCBIN_OUTPUT_DATA_SECTION) +# define INCBIN_OUTPUT_DATA_SECTION INCBIN_OUTPUT_SECTION +#endif + +/** + * @brief Optionally override the linker section into which size is emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. + * + * @note This is useful for Harvard architectures where program memory cannot + * be directly read from the program without special instructions. With this you + * can chose to put the size variable in RAM rather than ROM. + */ +#if !defined(INCBIN_OUTPUT_SIZE_SECTION) +# define INCBIN_OUTPUT_SIZE_SECTION INCBIN_OUTPUT_SECTION +#endif + +#if defined(__APPLE__) +# include "TargetConditionals.h" +# if defined(TARGET_OS_IPHONE) && !defined(INCBIN_SILENCE_BITCODE_WARNING) +# warning "incbin is incompatible with bitcode. Using the library will break upload to App Store if you have bitcode enabled. Add `#define INCBIN_SILENCE_BITCODE_WARNING` before including this header to silence this warning." +# endif +/* The directives are different for Apple branded compilers */ +# define INCBIN_SECTION INCBIN_OUTPUT_SECTION "\n" +# define INCBIN_GLOBAL(NAME) ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n" +# define INCBIN_INT ".long " +# define INCBIN_MANGLE "_" +# define INCBIN_BYTE ".byte " +# define INCBIN_TYPE(...) 
+#else +# define INCBIN_SECTION ".section " INCBIN_OUTPUT_SECTION "\n" +# define INCBIN_GLOBAL(NAME) ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n" +# if defined(__ghs__) +# define INCBIN_INT ".word " +# else +# define INCBIN_INT ".int " +# endif +# if defined(__USER_LABEL_PREFIX__) +# define INCBIN_MANGLE INCBIN_STRINGIZE(__USER_LABEL_PREFIX__) +# else +# define INCBIN_MANGLE "" +# endif +# if defined(INCBIN_ARM) +/* On arm assemblers, `@' is used as a line comment token */ +# define INCBIN_TYPE(NAME) ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n" +# elif defined(__MINGW32__) || defined(__MINGW64__) +/* Mingw doesn't support this directive either */ +# define INCBIN_TYPE(NAME) +# else +/* It's safe to use `@' on other architectures */ +# define INCBIN_TYPE(NAME) ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n" +# endif +# define INCBIN_BYTE ".byte " +#endif + +/* List of style types used for symbol names */ +#define INCBIN_STYLE_CAMEL 0 +#define INCBIN_STYLE_SNAKE 1 + +/** + * @brief Specify the prefix to use for symbol names. + * + * @note By default this is "g". + * + * @code + * #define INCBIN_PREFIX incbin + * #include "incbin.h" + * INCBIN(Foo, "foo.txt"); + * + * // Now you have the following symbols instead: + * // const unsigned char incbinFoo[]; + * // const unsigned char *const incbinFoo; + * // const unsigned int incbinFoo; + * @endcode + */ +#if !defined(INCBIN_PREFIX) +# define INCBIN_PREFIX g +#endif + +/** + * @brief Specify the style used for symbol names. + * + * Possible options are + * - INCBIN_STYLE_CAMEL "CamelCase" + * - INCBIN_STYLE_SNAKE "snake_case" + * + * @note By default this is INCBIN_STYLE_CAMEL + * + * @code + * #define INCBIN_STYLE INCBIN_STYLE_SNAKE + * #include "incbin.h" + * INCBIN(foo, "foo.txt"); + * + * // Now you have the following symbols: + * // const unsigned char foo_data[]; + * // const unsigned char *const foo_end; + * // const unsigned int foo_size; + * @endcode + */ +#if !defined(INCBIN_STYLE) +# define INCBIN_STYLE INCBIN_STYLE_CAMEL +#endif + +/* Style lookup tables */ +#define INCBIN_STYLE_0_DATA Data +#define INCBIN_STYLE_0_END End +#define INCBIN_STYLE_0_SIZE Size +#define INCBIN_STYLE_1_DATA _data +#define INCBIN_STYLE_1_END _end +#define INCBIN_STYLE_1_SIZE _size + +/* Style lookup: returning identifier */ +#define INCBIN_STYLE_IDENT(TYPE) \ + INCBIN_CONCATENATE( \ + INCBIN_STYLE_, \ + INCBIN_CONCATENATE( \ + INCBIN_EVAL(INCBIN_STYLE), \ + INCBIN_CONCATENATE(_, TYPE))) + +/* Style lookup: returning string literal */ +#define INCBIN_STYLE_STRING(TYPE) \ + INCBIN_STRINGIZE( \ + INCBIN_STYLE_IDENT(TYPE)) \ + +/* Generate the global labels by indirectly invoking the macro with our style + * type and concatenating the name against them. */ +#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \ + INCBIN_INVOKE( \ + INCBIN_GLOBAL, \ + INCBIN_CONCATENATE( \ + NAME, \ + INCBIN_INVOKE( \ + INCBIN_STYLE_IDENT, \ + TYPE))) \ + INCBIN_INVOKE( \ + INCBIN_TYPE, \ + INCBIN_CONCATENATE( \ + NAME, \ + INCBIN_INVOKE( \ + INCBIN_STYLE_IDENT, \ + TYPE))) + +/** + * @brief Externally reference binary data included in another translation unit. + * + * Produces three external symbols that reference the binary data included in + * another translation unit. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`. 
+ * @param NAME The name given for the binary data + * + * @code + * INCBIN_EXTERN(Foo); + * + * // Now you have the following symbols: + * // extern const unsigned char Foo[]; + * // extern const unsigned char *const Foo; + * // extern const unsigned int Foo; + * @endcode + * + * You may specify a custom optional data type as well as the first argument. + * @code + * INCBIN_EXTERN(custom_type, Foo); + * + * // Now you have the following symbols: + * // extern const custom_type Foo[]; + * // extern const custom_type *const Foo; + * // extern const unsigned int Foo; + * @endcode + */ +#define INCBIN_EXTERN(...) \ + INCBIN_CONCATENATE(INCBIN_EXTERN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__) +#define INCBIN_EXTERN_1(NAME, ...) \ + INCBIN_EXTERN_2(unsigned char, NAME) +#define INCBIN_EXTERN_2(TYPE, NAME) \ + INCBIN_EXTERNAL const INCBIN_ALIGN TYPE \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(DATA))[]; \ + INCBIN_EXTERNAL const INCBIN_ALIGN TYPE *const \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(END)); \ + INCBIN_EXTERNAL const unsigned int \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(SIZE)) + +/** + * @brief Externally reference textual data included in another translation unit. + * + * Produces three external symbols that reference the textual data included in + * another translation unit. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param NAME The name given for the textual data + * + * @code + * INCBIN_EXTERN(Foo); + * + * // Now you have the following symbols: + * // extern const char Foo[]; + * // extern const char *const Foo; + * // extern const unsigned int Foo; + * @endcode + */ +#define INCTXT_EXTERN(NAME) \ + INCBIN_EXTERN_2(char, NAME) + +/** + * @brief Include a binary file into the current translation unit. + * + * Includes a binary file into the current translation unit, producing three symbols + * for objects that encode the data and size respectively. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`. + * @param NAME The name to associate with this binary data (as an identifier.) + * @param FILENAME The file to include (as a string literal.) + * + * @code + * INCBIN(Icon, "icon.png"); + * + * // Now you have the following symbols: + * // const unsigned char Icon[]; + * // const unsigned char *const Icon; + * // const unsigned int Icon; + * @endcode + * + * You may specify a custom optional data type as well as the first argument. + * These macros are specialized by arity. + * @code + * INCBIN(custom_type, Icon, "icon.png"); + * + * // Now you have the following symbols: + * // const custom_type Icon[]; + * // const custom_type *const Icon; + * // const unsigned int Icon; + * @endcode + * + * @warning This must be used in global scope + * @warning The identifiers may be different if INCBIN_STYLE is not default + * + * To externally reference the data included by this in another translation unit + * please @see INCBIN_EXTERN. + */ +#ifdef _MSC_VER +# define INCBIN(NAME, FILENAME) \ + INCBIN_EXTERN(NAME) +#else +# define INCBIN(...) 
\ + INCBIN_CONCATENATE(INCBIN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__) +# if defined(__GNUC__) +# define INCBIN_1(...) _Pragma("GCC error \"Single argument INCBIN not allowed\"") +# elif defined(__clang__) +# define INCBIN_1(...) _Pragma("clang error \"Single argument INCBIN not allowed\"") +# else +# define INCBIN_1(...) /* Cannot do anything here */ +# endif +# define INCBIN_2(NAME, FILENAME) \ + INCBIN_3(unsigned char, NAME, FILENAME) +# define INCBIN_3(TYPE, NAME, FILENAME) INCBIN_COMMON(TYPE, NAME, FILENAME, /* No terminator for binary data */) +# define INCBIN_COMMON(TYPE, NAME, FILENAME, TERMINATOR) \ + __asm__(INCBIN_SECTION \ + INCBIN_GLOBAL_LABELS(NAME, DATA) \ + INCBIN_ALIGN_HOST \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \ + INCBIN_MACRO " \"" FILENAME "\"\n" \ + TERMINATOR \ + INCBIN_GLOBAL_LABELS(NAME, END) \ + INCBIN_ALIGN_BYTE \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \ + INCBIN_BYTE "1\n" \ + INCBIN_GLOBAL_LABELS(NAME, SIZE) \ + INCBIN_ALIGN_HOST \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \ + INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \ + INCBIN_ALIGN_HOST \ + ".text\n" \ + ); \ + INCBIN_EXTERN(TYPE, NAME) +#endif + +/** + * @brief Include a textual file into the current translation unit. + * + * This behaves the same as INCBIN except it produces char compatible arrays + * and implicitly adds a null-terminator byte, thus the size of data included + * by this is one byte larger than that of INCBIN. + * + * Includes a textual file into the current translation unit, producing three + * symbols for objects that encode the data and size respectively. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param NAME The name to associate with this binary data (as an identifier.) + * @param FILENAME The file to include (as a string literal.) + * + * @code + * INCTXT(Readme, "readme.txt"); + * + * // Now you have the following symbols: + * // const char Readme[]; + * // const char *const Readme; + * // const unsigned int Readme; + * @endcode + * + * @warning This must be used in global scope + * @warning The identifiers may be different if INCBIN_STYLE is not default + * + * To externally reference the data included by this in another translation unit + * please @see INCBIN_EXTERN. + */ +#if defined(_MSC_VER) +# define INCTXT(NAME, FILENAME) \ + INCBIN_EXTERN(NAME) +#else +# define INCTXT(NAME, FILENAME) \ + INCBIN_COMMON(char, NAME, FILENAME, INCBIN_BYTE "0\n") +#endif + +#endif \ No newline at end of file diff --git a/SmartCrop/intelligentroi.cpp b/SmartCrop/intelligentroi.cpp new file mode 100644 index 0000000..394509c --- /dev/null +++ b/SmartCrop/intelligentroi.cpp @@ -0,0 +1,128 @@ +// +// SmartCrop - A tool for content aware croping of images +// Copyright (C) 2024 Carl Philipp Klemm +// +// This file is part of SmartCrop. +// +// SmartCrop is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// SmartCrop is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with SmartCrop. If not, see . +// + +#include "intelligentroi.h" + +#include + +#include "utils.h" +#include "log.h" + +bool InteligentRoi::compPointPrio(const std::pair& a, const std::pair& b, const cv::Point2i& center) +{ + if(a.second != b.second) + return a.second > b.second; + + double distA = pointDist(a.first, center); + double distB = pointDist(b.first, center); + + return distA < distB; +} + +void InteligentRoi::slideRectToPoint(cv::Rect& rect, const cv::Point2i& point) +{ + if(!pointInRect(point, rect)) + { + if(point.x < rect.x) + rect.x = point.x; + else if(point.x > rect.x+rect.width) + rect.x = point.x-rect.width; + if(point.y < rect.y) + rect.y = point.y; + else if(point.y > rect.y+rect.height) + rect.y = point.y-rect.height; + } +} + +cv::Rect InteligentRoi::maxRect(bool& incompleate, const cv::Size2i& imageSize, std::vector> mustInclude) +{ + incompleate = false; + int diameter = std::min(imageSize.height, imageSize.width); + cv::Point2i point(imageSize.width/2, imageSize.height/2); + cv::Rect candiate(point.x-diameter/2, point.y-diameter/2, diameter, diameter); + + std::sort(mustInclude.begin(), mustInclude.end(), + [&point](const std::pair& a, const std::pair& b){return compPointPrio(a, b, point);}); + + while(true) + { + cv::Rect includeRect = rectFromPoints(mustInclude); + if(includeRect.width-2 > diameter || includeRect.height-2 > diameter) + { + incompleate = true; + slideRectToPoint(candiate, mustInclude.back().first); + mustInclude.pop_back(); + Log(Log::DEBUG)<<"cant fill"; + for(const std::pair& mipoint : mustInclude) + Log(Log::DEBUG)<& includePoint : mustInclude) + slideRectToPoint(candiate, includePoint.first); + + if(candiate.x < 0) + candiate.x = 0; + if(candiate.y < 0) + candiate.y = 0; + if(candiate.x+candiate.width > imageSize.width) + candiate.width = imageSize.width-candiate.x; + if(candiate.y+candiate.height > imageSize.height) + candiate.height = imageSize.height-candiate.y; + + return candiate; +} + +InteligentRoi::InteligentRoi(const Yolo& yolo) +{ + personId = yolo.getClassForStr("person"); +} + +bool InteligentRoi::getCropRectangle(cv::Rect& out, const std::vector& detections, const cv::Size2i& imageSize) +{ + std::vector> corners; + for(size_t i = 0; i < detections.size(); ++i) + { + int priority = detections[i].priority; + if(detections[i].class_id == personId) + { + corners.push_back({detections[i].box.tl()+cv::Point2i(detections[i].box.width/2, 0), priority+2}); + corners.push_back({detections[i].box.tl(), priority+1}); + corners.push_back({detections[i].box.br(), priority}); + corners.push_back({detections[i].box.tl()+cv::Point2i(detections[i].box.width, 0), priority+1}); + corners.push_back({detections[i].box.br()+cv::Point2i(0-detections[i].box.width, 0), priority}); + } + else + { + corners.push_back({detections[i].box.tl(), priority}); + corners.push_back({detections[i].box.br(), priority}); + corners.push_back({detections[i].box.tl()+cv::Point2i(detections[i].box.width, 0), priority}); + corners.push_back({detections[i].box.br()+cv::Point2i(0-detections[i].box.width, 0), priority}); + } + } + + bool incompleate; + out = maxRect(incompleate, imageSize, corners); + return incompleate; +} diff --git 
a/SmartCrop/intelligentroi.h b/SmartCrop/intelligentroi.h new file mode 100644 index 0000000..0cbb00b --- /dev/null +++ b/SmartCrop/intelligentroi.h @@ -0,0 +1,37 @@ +/* * SmartCrop - A tool for content aware croping of images + * Copyright (C) 2024 Carl Philipp Klemm + * + * This file is part of SmartCrop. + * + * SmartCrop is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * SmartCrop is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with SmartCrop. If not, see . + */ + +#pragma once + +#include + +#include "yolo.h" + +class InteligentRoi +{ +private: + int personId; + static bool compPointPrio(const std::pair& a, const std::pair& b, const cv::Point2i& center); + static void slideRectToPoint(cv::Rect& rect, const cv::Point2i& point); + static cv::Rect maxRect(bool& incompleate, const cv::Size2i& imageSize, std::vector> mustInclude = {}); + +public: + InteligentRoi(const Yolo& yolo); + bool getCropRectangle(cv::Rect& out, const std::vector& detections, const cv::Size2i& imageSize); +}; diff --git a/SmartCrop/log.cpp b/SmartCrop/log.cpp new file mode 100644 index 0000000..61fa188 --- /dev/null +++ b/SmartCrop/log.cpp @@ -0,0 +1,63 @@ +/** +* Lubricant Detecter +* Copyright (C) 2021 Carl Klemm +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* version 3 as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the +* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +* Boston, MA 02110-1301, USA. +*/ + +#include "log.h" + +Log::Log(Level type, bool endlineI): endline(endlineI) +{ + msglevel = type; + if(headers) + { + operator << ("["+getLabel(type)+"] "); + } +} + +Log::~Log() +{ + if(opened && endline) + { + std::cout<<'\n'; + } + opened = false; +} + + +std::string Log::getLabel(Level level) +{ + std::string label; + switch(level) + { + case DEBUG: + label = "DEBUG"; + break; + case INFO: + label = "INFO "; + break; + case WARN: + label = "WARN "; + break; + case ERROR: + label = "ERROR"; + break; + } + return label; +} + +bool Log::headers = false; +Log::Level Log::level = WARN; diff --git a/SmartCrop/log.h b/SmartCrop/log.h new file mode 100644 index 0000000..c0d90eb --- /dev/null +++ b/SmartCrop/log.h @@ -0,0 +1,64 @@ +/** +* eisgenerator +* Copyright (C) 2021 Carl Klemm +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* version 3 as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the +* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +* Boston, MA 02110-1301, USA. +*/ + +#pragma once +#include +#include + +class Log +{ +public: + + enum Level + { + DEBUG, + INFO, + WARN, + ERROR + }; + +private: + bool opened = false; + Level msglevel = DEBUG; + bool endline = true; + + std::string getLabel(Level level); + +public: + + static bool headers; + static Level level; + + Log() {} + Log(Level type, bool endlineI = true); + ~Log(); + + template Log &operator<<(const T &msg) + { + if(msglevel >= level) + { + if(msglevel == ERROR) + std::cerr<. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "yolo.h" +#include "log.h" +#include "options.h" +#include "utils.h" +#include "intelligentroi.h" +#include "seamcarving.h" +#include "facerecognizer.h" + +const Yolo::Detection* pointInDetectionHoriz(int x, const std::vector& detections, const Yolo::Detection* ignore = nullptr) +{ + const Yolo::Detection* inDetection = nullptr; + for(const Yolo::Detection& detection : detections) + { + if(ignore && ignore == &detection) + continue; + + if(detection.box.x <= x && detection.box.x+detection.box.width >= x) + { + if(!inDetection || detection.box.br().x > inDetection->box.br().x) + inDetection = &detection; + } + } + return inDetection; +} + +bool findRegionEndpointHoriz(int& x, const std::vector& detections, int imgSizeX) +{ + const Yolo::Detection* inDetection = pointInDetectionHoriz(x, detections); + + Log(Log::DEBUG, false)<<__func__<<" point "< x) + { + if(closest == nullptr || detection.box.x-x > closest->box.x-x) + closest = &detection; + } + } + if(closest) + x = closest->box.x; + else + x = imgSizeX; + + Log(Log::DEBUG)<<" is not in any box and will be moved to "<className : "null")<<") is"; + return false; + } + else + { + x = inDetection->box.br().x; + Log(Log::DEBUG, false)<<" is in a box and will be moved to its end "<box.br().x > x) + { + Log(Log::DEBUG)<<"it is again in a box"; + return findRegionEndpointHoriz(x, detections, imgSizeX); + } + else + { + Log(Log::DEBUG)<<"it is not in a box"; + return true; + } + } +} + +std::vector> cutImageIntoHorzRegions(cv::Mat& image, const std::vector& detections) +{ + std::vector> out; + + std::cout<<__func__<<' '<>& slices) +{ + assert(!slices.empty()); + + int cols = 0; + for(const std::pair& slice : slices) + cols += slice.first.cols; + + + cv::Mat image(cols, slices[0].first.rows, slices[0].first.type()); + Log(Log::DEBUG)<<__func__<<' '<& slice : slices) + { + cv::Rect rect(col, 0, slice.first.cols, slice.first.rows); + Log(Log::DEBUG)<<__func__<<' '< detections, double targetAspectRatio = 1.0) +{ + detections.erase(std::remove_if(detections.begin(), detections.end(), [](const Yolo::Detection& detection){return detection.priority < 3;}), detections.end()); + + double aspectRatio = image.cols/static_cast(image.rows); + + Log(Log::DEBUG)<<"Image size "< targetAspectRatio) + vertical = true; + + int requiredLines = 0; + if(!vertical) + requiredLines = image.rows*targetAspectRatio - image.cols; + else + requiredLines = image.cols/targetAspectRatio - image.rows; + + Log(Log::DEBUG)<<__func__<<' '<> slices = cutImageIntoHorzRegions(image, detections); + Log(Log::DEBUG)<<"Image has "<& slice : slices) + { + Log(Log::DEBUG)<<"a "<<(slice.second ? 
"frozen" : "unfrozen")<<" slice of size "< seamsForSlice(slices.size(), 0); + for(size_t i = 0; i < slices.size(); ++i) + { + if(!slices[i].second) + seamsForSlice[i] = (static_cast(slices[i].first.cols)/totalResizableSize)*requiredLines; + } + + int residual = requiredLines - std::accumulate(seamsForSlice.begin(), seamsForSlice.end(), decltype(seamsForSlice)::value_type(0));; + for(ssize_t i = slices.size()-1; i >= 0; --i) + { + if(!slices[i].second) + { + seamsForSlice[i] += residual; + break; + } + } + + for(size_t i = 0; i < slices.size(); ++i) + { + if(seamsForSlice[i] != 0) + { + bool ret = SeamCarving::strechImage(slices[i].first, seamsForSlice[i], true); + if(!ret) + { + if(vertical) + transpose(image, image); + return false; + } + } + } + + image = assembleFromSlicesHoriz(slices); + + if(vertical) + cv::transpose(image, image); + + return true; +} + +void drawDebugInfo(cv::Mat &image, const cv::Rect& rect, const std::vector& detections) +{ + for(const Yolo::Detection& detection : detections) + { + cv::rectangle(image, detection.box, detection.color, 3); + std::string label = detection.className + ' ' + std::to_string(detection.confidence).substr(0, 4) + ' ' + std::to_string(detection.priority); + cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_DUPLEX, 1, 1, 0); + cv::Rect textBox(detection.box.x, detection.box.y - 40, labelSize.width + 10, labelSize.height + 20); + cv::rectangle(image, textBox, detection.color, cv::FILLED); + cv::putText(image, label, cv::Point(detection.box.x + 5, detection.box.y - 10), cv::FONT_HERSHEY_DUPLEX, 1, cv::Scalar(0, 0, 0), 1, 0); + } + + cv::rectangle(image, rect, cv::Scalar(0, 0, 255), 8); +} + +static void reduceSize(cv::Mat& image, const cv::Size& targetSize) +{ + int longTargetSize = std::max(targetSize.width, targetSize.height)*2; + if(std::max(image.cols, image.rows) > longTargetSize) + { + if(image.cols > image.rows) + { + double ratio = static_cast(longTargetSize)/image.cols; + cv::resize(image, image, {longTargetSize, static_cast(image.rows*ratio)}, 0, 0, ratio < 1 ? cv::INTER_AREA : cv::INTER_CUBIC); + } + else + { + double ratio = static_cast(longTargetSize)/image.rows; + cv::resize(image, image, {static_cast(image.cols*ratio), longTargetSize}, 0, 0, ratio < 1 ? 
cv::INTER_AREA : cv::INTER_CUBIC); + } + } +} + +void pipeline(const std::filesystem::path& path, const Config& config, Yolo& yolo, FaceRecognizer* recognizer, + std::mutex& reconizerMutex, const std::filesystem::path& debugOutputPath) +{ + InteligentRoi intRoi(yolo); + cv::Mat image = cv::imread(path); + if(!image.data) + { + Log(Log::WARN)<<"could not load image "< detections = yolo.runInference(image); + + Log(Log::DEBUG)<<"Got "<isMatch(person); + reconizerMutex.unlock(); + if(match.person >= 0) + { + detection.priority += 10; + hasmatch = true; + detections.push_back({0, "Face", match.confidence, 20, {255, 0, 0}, match.rect}); + } + } + Log(Log::DEBUG)<& images, const Config& config, FaceRecognizer* recognizer, + std::mutex& reconizerMutex, const std::filesystem::path& debugOutputPath) +{ + Yolo yolo(config.modelPath, {640, 480}, config.classesPath, false); + for(std::filesystem::path path : images) + pipeline(path, config, yolo, recognizer, reconizerMutex, debugOutputPath); +} + +template +std::vector> splitVector(const std::vector& vec, size_t parts) +{ + std::vector> out; + + size_t length = vec.size()/parts; + size_t remain = vec.size() % parts; + + size_t begin = 0; + size_t end = 0; + + for (size_t i = 0; i < std::min(parts, vec.size()); ++i) + { + end += (remain > 0) ? (length + !!(remain--)) : length; + out.push_back(std::vector(vec.begin() + begin, vec.begin() + end)); + begin = end; + } + + return out; +} + +int main(int argc, char* argv[]) +{ + Log::level = Log::INFO; + + Config config; + argp_parse(&argp, argc, argv, 0, 0, &config); + + if(config.outputDir.empty()) + { + Log(Log::ERROR)<<"a output path \"-o\" is required"; + return 1; + } + + if(config.imagePaths.empty()) + { + Log(Log::ERROR)<<"at least one input image or directory is required"; + return 1; + } + + std::vector imagePaths; + + for(const std::filesystem::path& path : config.imagePaths) + getImageFiles(path, imagePaths); + + Log(Log::DEBUG)<<"Images:"; + for(const::std::filesystem::path& path: imagePaths) + Log(Log::DEBUG)<addReferances({personImage}); + recognizer->setThreshold(config.threshold); + } + + std::vector threads; + std::vector> imagePathParts = splitVector(imagePaths, std::thread::hardware_concurrency()); + + for(size_t i = 0; i < imagePathParts.size(); ++i) + threads.push_back(std::thread(threadFn, imagePathParts[i], std::ref(config), recognizer, std::ref(recognizerMutex), std::ref(debugOutputPath))); + + for(std::thread& thread : threads) + thread.join(); + + return 0; +} diff --git a/SmartCrop/options.h b/SmartCrop/options.h new file mode 100644 index 0000000..38c2840 --- /dev/null +++ b/SmartCrop/options.h @@ -0,0 +1,117 @@ +/* * SmartCrop - A tool for content aware croping of images + * Copyright (C) 2024 Carl Philipp Klemm + * + * This file is part of SmartCrop. + * + * SmartCrop is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * SmartCrop is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with SmartCrop. If not, see . 
+ */
+
+#pragma once
+#include <argp.h>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <filesystem>
+#include <opencv2/core.hpp>
+#include "log.h"
+
+const char *argp_program_version = "AIImagePreprocesses";
+const char *argp_program_bug_address = "";
+static char doc[] = "Application that transforms images into the formats, sizes and aspect ratios required for ai training";
+static char args_doc[] = "FILE(S)";
+
+static struct argp_option options[] =
+{
+    {"verbose", 'v', 0, 0, "Show debug messages" },
+    {"quiet", 'q', 0, 0, "only output data" },
+    {"model", 'm', "[FILENAME]", 0, "YoloV8 model to use for detection" },
+    {"classes", 'c', "[FILENAME]", 0, "classes text file to use" },
+    {"out", 'o', "[DIRECTORY]", 0, "directory where images are to be saved" },
+    {"debug", 'd', 0, 0, "output debug images" },
+    {"seam-carving", 's', 0, 0, "use seam carving to change image aspect ratio instead of cropping"},
+    {"size", 'z', "[PIXELS]", 0, "target output size, default: 512"},
+    {"focus-person", 'f', "[FILENAME]", 0, "a file name to an image of a person that the crop should focus on"},
+    {"person-threshold", 't', "[NUMBER]", 0, "the threshold at which to consider a person matched, defaults to 0.363"},
+    {0}
+};
+
+struct Config
+{
+    std::vector<std::filesystem::path> imagePaths;
+    std::filesystem::path modelPath;
+    std::filesystem::path classesPath;
+    std::filesystem::path outputDir;
+    std::filesystem::path focusPersonImage;
+    bool seamCarving = false;
+    bool debug = false;
+    double threshold = 0.363;
+    cv::Size targetSize = cv::Size(512, 512);
+};
+
+static error_t parse_opt (int key, char *arg, struct argp_state *state)
+{
+    Config *config = reinterpret_cast<Config*>(state->input);
+    try
+    {
+        switch (key)
+        {
+        case 'q':
+            Log::level = Log::ERROR;
+            break;
+        case 'v':
+            Log::level = Log::DEBUG;
+            break;
+        case 'm':
+            config->modelPath = arg;
+            break;
+        case 'c':
+            config->classesPath = arg;
+            break;
+        case 'd':
+            config->debug = true;
+            break;
+        case 'o':
+            config->outputDir.assign(arg);
+            break;
+        case 's':
+            config->seamCarving = true;
+            break;
+        case 'f':
+            config->focusPersonImage = arg;
+            break;
+        case 't':
+            config->threshold = std::atof(arg);
+            break;
+        case 'z':
+        {
+            int x = std::stoi(arg);
+            config->targetSize = cv::Size(x, x);
+            break;
+        }
+        case ARGP_KEY_ARG:
+            config->imagePaths.push_back(arg);
+            break;
+        default:
+            return ARGP_ERR_UNKNOWN;
+        }
+    }
+    catch(const std::invalid_argument& ex)
+    {
+        std::cout<<arg<<" passed for argument -"<<static_cast<char>(key)<<" is not a valid number.\n";
+        return ARGP_KEY_ERROR;
+    }
+    return 0;
+}
+
+static struct argp argp = {options, parse_opt, args_doc, doc};
diff --git a/SmartCrop/readfile.h b/SmartCrop/readfile.h
new file mode 100644
index 0000000..a6f61be
--- /dev/null
+++ b/SmartCrop/readfile.h
@@ -0,0 +1,35 @@
+/* * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */ + +#pragma once +#include +#include +#include +#include +#include + +inline std::string readFile(const std::filesystem::path& path) +{ + std::ifstream file(path); + if(!file.is_open()) + throw std::runtime_error(std::string("could not open file ") + path.string()); + std::stringstream ss; + ss<. +// + +#include "seamcarving.h" + +#include +#include +#include +#include +#include +#include +#include +#include "log.h" + +bool SeamCarving::strechImage(cv::Mat& image, int seams, bool grow, std::vector>* seamsVect) +{ + cv::Mat newFrame = image.clone(); + assert(!newFrame.empty()); + std::vector> vecSeams; + + for(int i = 0; i < seams; i++) + { + //Gradient Magnitude for intensity of image. + cv::Mat gradientMagnitude = computeGradientMagnitude(newFrame); + //Use DP to create the real energy map that is used for path calculation. + // Strictly using vertical paths for testing simplicity. + cv::Mat pathIntensityMat = computePathIntensityMat(gradientMagnitude); + + if(pathIntensityMat.rows == 0 && pathIntensityMat.cols == 0) + return false; + std::vector seam = getLeastImportantPath(pathIntensityMat); + vecSeams.push_back(seam); + if(seamsVect) + seamsVect->push_back(seam); + + newFrame = removeLeastImportantPath(newFrame, seam); + + if(newFrame.rows == 0 || newFrame.cols == 0) + return false; + } + + if (grow) + { + cv::Mat growMat = image.clone(); + + for(size_t i = 0; i < vecSeams.size(); i++) + { + growMat = addLeastImportantPath(growMat,vecSeams[i]); + } + image = growMat; + } + else + { + image = newFrame; + } + return true; +} + +bool SeamCarving::strechImageVert(cv::Mat& image, int seams, bool grow, std::vector>* seamsVect) +{ + cv::transpose(image, image); + bool ret = strechImage(image, seams, grow, seamsVect); + cv::transpose(image, image); + return ret; +} + +bool SeamCarving::strechImageWithSeamsImage(cv::Mat& image, cv::Mat& seamsImage, int seams, bool grow) +{ + std::vector> seamsVect; + seamsImage = image.clone(); + + bool ret = SeamCarving::strechImage(image, seams, grow, &seamsVect); + if(!ret) + return false; + + for(size_t i = 0; i < seamsVect.size(); ++i) + seamsImage = drawSeam(seamsImage, seamsVect[i]); + return true; +} + +cv::Mat SeamCarving::GetEnergyImg(const cv::Mat &img) +{ + // find partial derivative of x-axis and y-axis seperately + // sum up the partial derivates + float pd[] = {1, 2, 1, 0, 0, 0, -1, -2 - 1}; + cv::Mat xFilter(3, 3, CV_32FC1, pd); + cv::Mat yFilter = xFilter.t(); + cv::Mat grayImg; + cv::cvtColor(img, grayImg, cv::COLOR_RGBA2GRAY); + cv::Mat dxImg; + cv::Mat dyImg; + + cv::filter2D(grayImg, dxImg, 0, xFilter); + cv::filter2D(grayImg, dyImg, 0, yFilter); + //cv::Mat zeroMat = cv::Mat::zeros(dxImg.rows, dxImg.cols, dxImg.type()); + //cv::Mat absDxImg; + //cv::Mat absDyImg; + //cv::absdiff(dxImg, zeroMat, absDxImg); + //cv::absdiff(dyImg, zeroMat, absDyImg); + cv::Mat absDxImg = cv::abs(dxImg); + cv::Mat absDyImg = cv::abs(dyImg); + + cv::Mat energyImg; + cv::add(absDxImg, absDyImg, energyImg); + return energyImg; +} + +cv::Mat SeamCarving::computeGradientMagnitude(const cv::Mat &frame) +{ + cv::Mat grayScale; + cv::cvtColor(frame, grayScale, cv::COLOR_RGBA2GRAY); + cv::Mat drv = cv::Mat(grayScale.size(), CV_16SC1); + cv::Mat drv32f = cv::Mat(grayScale.size(), CV_32FC1); + cv::Mat mag = cv::Mat::zeros(grayScale.size(), CV_32FC1); + Sobel(grayScale, drv, CV_16SC1, 1, 0); + drv.convertTo(drv32f, CV_32FC1); + cv::accumulateSquare(drv32f, mag); + Sobel(grayScale, drv, CV_16SC1, 0, 1); + drv.convertTo(drv32f, CV_32FC1); + cv::accumulateSquare(drv32f, 
mag); + cv::sqrt(mag, mag); + return mag; +} + +float SeamCarving::intensity(float currIndex, int start, int end) +{ + if(start < 0 || start >= end) + { + return FLT_MAX; + } + else + { + return currIndex; + } +} + +cv::Mat SeamCarving::computePathIntensityMat(const cv::Mat &rawEnergyMap) +{ + cv::Mat pathIntensityMap = cv::Mat(rawEnergyMap.size(), CV_32FC1); + + if(rawEnergyMap.total() == 0 || pathIntensityMap.total() == 0) + { + return cv::Mat(); + } + + //First row of intensity paths is the same as the energy map + rawEnergyMap.row(0).copyTo(pathIntensityMap.row(0)); + float max = 0; + + //The rest of them use the DP calculation using the minimum of the 3 pixels above them + their own intensity. + for(int row = 1; row < pathIntensityMap.rows; row++) + { + for(int col = 0; col < pathIntensityMap.cols; col++) + { + //The initial intensity of the pixel is its raw intensity + float pixelIntensity = rawEnergyMap.at(row, col); + //The minimum intensity from the current path of the 3 pixels above it is added to its intensity. + float p1 = intensity(pathIntensityMap.at(row-1, col-1), col - 1, pathIntensityMap.cols); + float p2 = intensity(pathIntensityMap.at(row-1, col), col, pathIntensityMap.cols); + float p3 = intensity(pathIntensityMap.at(row-1, col+1), col + 1, pathIntensityMap.cols); + + float minIntensity = std::min(p1, p2); + minIntensity = std::min(minIntensity, p3); + + pixelIntensity += minIntensity; + + max = std::max(max, pixelIntensity); + pathIntensityMap.at(row, col) = pixelIntensity; + } + } + return pathIntensityMap; +} + +std::vector SeamCarving::getLeastImportantPath(const cv::Mat &importanceMap) +{ + if(importanceMap.total() == 0) + { + return std::vector(); + } + + //Find the beginning of the least important path. Trying an averaging approach because absolute min wasn't very reliable. 
+std::vector<int> SeamCarving::getLeastImportantPath(const cv::Mat &importanceMap)
+{
+	if(importanceMap.total() == 0)
+	{
+		return std::vector<int>();
+	}
+
+	//Find the beginning of the least important path. Trying an averaging approach because absolute min wasn't very reliable.
+	float minImportance = importanceMap.at<float>(importanceMap.rows - 1, 0);
+	int minCol = 0;
+	for(int col = 1; col < importanceMap.cols; col++)
+	{
+		float currPixel = importanceMap.at<float>(importanceMap.rows - 1, col);
+		if(currPixel < minImportance)
+		{
+			minCol = col;
+			minImportance = currPixel;
+		}
+	}
+
+	std::vector<int> leastEnergySeam(importanceMap.rows);
+	leastEnergySeam[importanceMap.rows-1] = minCol;
+	for(int row = importanceMap.rows - 2; row >= 0; row--)
+	{
+		//Guard the column index before reading so the edge columns never index out of bounds.
+		float p1 = minCol > 0 ? intensity(importanceMap.at<float>(row, minCol-1), minCol - 1, importanceMap.cols) : FLT_MAX;
+		float p2 = intensity(importanceMap.at<float>(row, minCol), minCol, importanceMap.cols);
+		float p3 = minCol < importanceMap.cols - 1 ? intensity(importanceMap.at<float>(row, minCol+1), minCol + 1, importanceMap.cols) : FLT_MAX;
+		//Adjust the min column for path following
+		if(p1 < p2 && p1 < p3)
+		{
+			minCol -= 1;
+		}
+		else if(p3 < p1 && p3 < p2)
+		{
+			minCol += 1;
+		}
+		leastEnergySeam[row] = minCol;
+	}
+
+	return leastEnergySeam;
+}
+
+cv::Mat SeamCarving::removeLeastImportantPath(const cv::Mat &original, const std::vector<int> &seam)
+{
+	cv::Size orgSize = original.size();
+	//the new mat needs to shrink by one column
+	cv::Size size = cv::Size(orgSize.width-1, orgSize.height);
+	cv::Mat newMat = cv::Mat(size, original.type());
+
+	for(size_t row = 0; row < seam.size(); row++)
+	{
+		removePixel(original, newMat, row, seam[row]);
+	}
+	return newMat;
+}
+
+void SeamCarving::removePixel(const cv::Mat &original, cv::Mat &outputMat, int row, int minCol)
+{
+	int width = original.cols;
+	int channels = original.channels();
+	int originRowStart = row * channels * width;
+	int newRowStart = row * channels * (width - 1);
+	int firstNum = minCol * channels;
+	unsigned char *rawOrig = original.data;
+	unsigned char *rawOutput = outputMat.data;
+
+	//copy the pixels left of the seam column unchanged
+	memcpy(rawOutput + newRowStart, rawOrig + originRowStart, firstNum);
+
+	int originRowMid = originRowStart + (minCol + 1) * channels;
+	int newRowMid = newRowStart + minCol * channels;
+	int secondNum = (width - 1) * channels - firstNum;
+
+	//copy the pixels right of the seam column, shifted left by one pixel
+	memcpy(rawOutput + newRowMid, rawOrig + originRowMid, secondNum);
+
+	//blend the removed pixel into its left and right neighbours
+	int leftPixel = minCol - 1;
+	int rightPixel = minCol + 1;
+
+	int byte1 = rawOrig[originRowStart + minCol * channels];
+	int byte2 = rawOrig[originRowStart + minCol * channels + 1];
+	int byte3 = rawOrig[originRowStart + minCol * channels + 2];
+
+	if(rightPixel < width)
+	{
+		int byte1R = rawOrig[originRowStart + rightPixel * channels];
+		int byte2R = rawOrig[originRowStart + rightPixel * channels + 1];
+		int byte3R = rawOrig[originRowStart + rightPixel * channels + 2];
+		rawOutput[newRowStart + minCol * channels] = (unsigned char)((byte1 + byte1R) / 2);
+		rawOutput[newRowStart + minCol * channels + 1] = (unsigned char)((byte2 + byte2R) / 2);
+		rawOutput[newRowStart + minCol * channels + 2] = (unsigned char)((byte3 + byte3R) / 2);
+	}
+
+	if(leftPixel >= 0)
+	{
+		int byte1L = rawOrig[originRowStart + leftPixel*channels];
+		int byte2L = rawOrig[originRowStart + leftPixel*channels+1];
+		int byte3L = rawOrig[originRowStart + leftPixel*channels+2];
+		rawOutput[newRowStart + leftPixel*channels] = (unsigned char)((byte1 + byte1L)/2);
+		rawOutput[newRowStart + leftPixel*channels+1] = (unsigned char)((byte2 + byte2L)/2);
+		rawOutput[newRowStart + leftPixel*channels+2] = (unsigned char)((byte3 + byte3L)/2);
+	}
+}
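Editor's note: removePixel deletes one pixel from a packed interleaved row with two memcpy calls: the bytes before the seam column, then the bytes after it shifted left by one pixel. The index arithmetic in isolation, as a hypothetical standalone helper:

```cpp
#include <cstring>

// Copy row `row` of a width*channels interleaved image into a (width-1)-wide
// output, skipping the pixel at column `col`.
void copyRowSkippingPixel(const unsigned char* src, unsigned char* dst,
                          int row, int col, int width, int channels)
{
	const unsigned char* srcRow = src + row * width * channels;
	unsigned char* dstRow = dst + row * (width - 1) * channels;
	std::memcpy(dstRow, srcRow, col * channels);                        // pixels [0, col)
	std::memcpy(dstRow + col * channels, srcRow + (col + 1) * channels,
	            (width - 1 - col) * channels);                          // pixels (col, width)
}
```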
+cv::Mat SeamCarving::addLeastImportantPath(const cv::Mat &original, const std::vector<int> &seam)
+{
+	cv::Size orgSize = original.size();
+	//the new mat needs to grow by one column
+	cv::Size size = cv::Size(orgSize.width+1, orgSize.height);
+	cv::Mat newMat = cv::Mat(size, original.type());
+
+	for(size_t row = 0; row < seam.size(); row++)
+	{
+		addPixel(original, newMat, row, seam[row]);
+	}
+	return newMat;
+}
+
+void SeamCarving::addPixel(const cv::Mat &original, cv::Mat &outputMat, int row, int minCol)
+{
+	int width = original.cols;
+	int channels = original.channels();
+	int originRowStart = row * channels * width;
+	int newRowStart = row * channels * (width + 1);
+	int firstNum = (minCol + 1) * channels;
+
+	unsigned char *rawOrig = original.data;
+	unsigned char *rawOutput = outputMat.data;
+
+	//copy the pixels up to and including the seam column unchanged
+	memcpy(rawOutput + newRowStart, rawOrig + originRowStart, firstNum);
+
+	//insert a copy of the pixel right of the seam
+	memcpy(rawOutput + newRowStart + firstNum, rawOrig + originRowStart + firstNum, channels);
+
+	int originRowMid = originRowStart + ((minCol + 1) * channels);
+	int newRowMid = newRowStart + ((minCol + 2) * channels);
+	int secondNum = (width * channels) - firstNum;
+
+	//copy the remaining pixels, shifted right by one pixel
+	memcpy(rawOutput + newRowMid, rawOrig + originRowMid, secondNum);
+
+	//blend the seam pixel with its left and right neighbours
+	int leftPixel = minCol - 1;
+	int rightPixel = minCol + 1;
+
+	int byte1 = rawOrig[originRowStart + minCol * channels];
+	int byte2 = rawOrig[originRowStart + minCol * channels + 1];
+	int byte3 = rawOrig[originRowStart + minCol * channels + 2];
+
+	if(rightPixel < width)
+	{
+		int byte1R = rawOrig[originRowStart + rightPixel * channels];
+		int byte2R = rawOrig[originRowStart + rightPixel * channels + 1];
+		int byte3R = rawOrig[originRowStart + rightPixel * channels + 2];
+		rawOutput[newRowStart + minCol * channels] = (unsigned char)((byte1 + byte1R) / 2);
+		rawOutput[newRowStart + minCol * channels + 1] = (unsigned char)((byte2 + byte2R) / 2);
+		rawOutput[newRowStart + minCol * channels + 2] = (unsigned char)((byte3 + byte3R) / 2);
+	}
+
+	if(leftPixel >= 0)
+	{
+		int byte1L = rawOrig[originRowStart + leftPixel*channels];
+		int byte2L = rawOrig[originRowStart + leftPixel*channels+1];
+		int byte3L = rawOrig[originRowStart + leftPixel*channels+2];
+		rawOutput[newRowStart + leftPixel*channels] = (unsigned char)((byte1 + byte1L)/2);
+		rawOutput[newRowStart + leftPixel*channels+1] = (unsigned char)((byte2 + byte2L)/2);
+		rawOutput[newRowStart + leftPixel*channels+2] = (unsigned char)((byte3 + byte3L)/2);
+	}
+}
+
+cv::Mat SeamCarving::drawSeam(const cv::Mat &frame, const std::vector<int> &seam)
+{
+	cv::Mat retMat = frame.clone();
+	//paint the seam pixel of every row green
+	for(int row = 0; row < frame.rows; row++)
+	{
+		retMat.at<cv::Vec3b>(row, seam[row])[0] = 0;
+		retMat.at<cv::Vec3b>(row, seam[row])[1] = 255;
+		retMat.at<cv::Vec3b>(row, seam[row])[2] = 0;
+	}
+	return retMat;
+}
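Editor's note: taken together, the public API shrinks an image by `seams` columns, or widens it when `grow` is true by re-inserting the removed seams. A usage sketch, not part of the commit; the file names are hypothetical, and since computeGradientMagnitude converts with COLOR_RGBA2GRAY the input is converted to RGBA first:

```cpp
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include "seamcarving.h"

int main()
{
	cv::Mat image = cv::imread("input.jpg"); // hypothetical input path
	if(image.empty())
		return 1;
	// the energy computation expects a 4-channel RGBA image
	cv::cvtColor(image, image, cv::COLOR_BGR2RGBA);

	// remove 50 vertical seams; pass grow=true to widen instead
	if(!SeamCarving::strechImage(image, 50, false))
		return 1;

	cv::cvtColor(image, image, cv::COLOR_RGBA2BGR);
	cv::imwrite("output.jpg", image);
	return 0;
}
```

strechImageVert gives the horizontal-seam variant by transposing, carving, and transposing back.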
diff --git a/SmartCrop/seamcarving.h b/SmartCrop/seamcarving.h
new file mode 100644
index 0000000..e306694
--- /dev/null
+++ b/SmartCrop/seamcarving.h
@@ -0,0 +1,43 @@
+/*
+ * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include <opencv2/core.hpp>
+
+class SeamCarving
+{
+private:
+	static cv::Mat GetEnergyImg(const cv::Mat &img);
+	static cv::Mat computeGradientMagnitude(const cv::Mat &frame);
+	static float intensity(float currIndex, int start, int end);
+	static cv::Mat computePathIntensityMat(const cv::Mat &rawEnergyMap);
+	static std::vector<int> getLeastImportantPath(const cv::Mat &importanceMap);
+	static cv::Mat removeLeastImportantPath(const cv::Mat &original, const std::vector<int> &seam);
+	static void removePixel(const cv::Mat &original, cv::Mat &outputMat, int row, int minCol);
+	static cv::Mat addLeastImportantPath(const cv::Mat &original, const std::vector<int> &seam);
+	static void addPixel(const cv::Mat &original, cv::Mat &outputMat, int row, int minCol);
+	static cv::Mat drawSeam(const cv::Mat &frame, const std::vector<int> &seam);
+
+public:
+	static bool strechImage(cv::Mat& image, int seams, bool grow, std::vector<std::vector<int>>* seamsVect = nullptr);
+	static bool strechImageVert(cv::Mat& image, int seams, bool grow, std::vector<std::vector<int>>* seamsVect = nullptr);
+	static bool strechImageWithSeamsImage(cv::Mat& image, cv::Mat& seamsImage, int seams, bool grow);
+};
diff --git a/SmartCrop/tokenize.cpp b/SmartCrop/tokenize.cpp
new file mode 100644
index 0000000..c565fc2
--- /dev/null
+++ b/SmartCrop/tokenize.cpp
@@ -0,0 +1,46 @@
+//
+// SmartCrop - A tool for content aware cropping of images
+// Copyright (C) 2024 Carl Philipp Klemm
+//
+// This file is part of SmartCrop.
+//
+// SmartCrop is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// SmartCrop is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "tokenize.h"
+
+
+std::vector<std::string> tokenizeBinaryIgnore(const std::string& str, const char delim, const char ignoreBraket, const char escapeChar)
+{
+	std::vector<std::string> tokens;
+	std::string token;
+	bool inBaracket = false;
+	for(size_t i = 0; i < str.size(); ++i)
+	{
+		//split on the delimiter, unless it is inside a bracket pair or escaped
+		if(str[i] == delim && !inBaracket && (i == 0 || str[i-1] != escapeChar))
+		{
+			tokens.push_back(token);
+			token.clear();
+		}
+		else
+		{
+			token.push_back(str[i]);
+		}
+		if(ignoreBraket == str[i])
+			inBaracket = !inBaracket;
+	}
+	if(!inBaracket)
+		tokens.push_back(token);
+	return tokens;
+}
diff --git a/SmartCrop/tokenize.h b/SmartCrop/tokenize.h
new file mode 100644
index 0000000..85d7321
--- /dev/null
+++ b/SmartCrop/tokenize.h
@@ -0,0 +1,26 @@
+/*
+ * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> tokenizeBinaryIgnore(const std::string& str, const char delim, const char ignoreBraket = '\0',
+	const char escapeChar = '\0');
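Editor's note: tokenizeBinaryIgnore splits on a delimiter while treating the "bracket" character as a quote toggle and honoring a backslash escape; if a bracket is left open, the trailing token is dropped. A short usage sketch on classes.txt-style lines (values illustrative, not part of the commit):

```cpp
#include <iostream>
#include "tokenize.h"

int main()
{
	// split a class definition line into name and priority
	std::vector<std::string> fields = tokenizeBinaryIgnore("person, 10", ',', '"', '\\');
	// fields[0] == "person", fields[1] == " 10"

	// commas inside double quotes do not split
	std::vector<std::string> quoted = tokenizeBinaryIgnore("\"sports ball, juggling\", 3", ',', '"', '\\');
	// quoted[0] == "\"sports ball, juggling\"", quoted[1] == " 3"

	for(const std::string& token : fields)
		std::cout << '[' << token << "] ";
	std::cout << '\n';
	return 0;
}
```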
diff --git a/SmartCrop/utils.cpp b/SmartCrop/utils.cpp
new file mode 100644
index 0000000..b9b1104
--- /dev/null
+++ b/SmartCrop/utils.cpp
@@ -0,0 +1,80 @@
+//
+// SmartCrop - A tool for content aware cropping of images
+// Copyright (C) 2024 Carl Philipp Klemm
+//
+// This file is part of SmartCrop.
+//
+// SmartCrop is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// SmartCrop is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "utils.h"
+
+#include <filesystem>
+#include <vector>
+#include <limits>
+
+bool isImagePath(const std::filesystem::path& path)
+{
+	return std::filesystem::is_regular_file(path) && (path.extension() == ".png" || path.extension() == ".jpg" || path.extension() == ".jpeg");
+}
+
+void getImageFiles(const std::filesystem::path& path, std::vector<std::filesystem::path>& paths)
+{
+	if(isImagePath(path))
+	{
+		paths.push_back(path);
+	}
+	else if(std::filesystem::is_directory(path))
+	{
+		//recurse into subdirectories, collecting image files along the way
+		for(const std::filesystem::directory_entry& dirent : std::filesystem::directory_iterator(path))
+		{
+			if(std::filesystem::is_directory(dirent.path()))
+				getImageFiles(dirent.path(), paths);
+			else if(isImagePath(dirent.path()))
+				paths.push_back(dirent.path());
+		}
+	}
+}
+
+cv::Rect rectFromPoints(const std::vector<std::pair<cv::Point2i, int>>& points)
+{
+	int left = std::numeric_limits<int>::max();
+	int right = std::numeric_limits<int>::min();
+	int top = std::numeric_limits<int>::max();
+	int bottom = std::numeric_limits<int>::min();
+
+	for(const std::pair<cv::Point2i, int>& point : points)
+	{
+		left = point.first.x < left ? point.first.x : left;
+		right = point.first.x > right ? point.first.x : right;
+
+		top = point.first.y < top ? point.first.y : top;
+		bottom = point.first.y > bottom ? point.first.y : bottom;
+	}
+
+	return cv::Rect(left, top, right-left, bottom-top);
+}
+
+double pointDist(const cv::Point2i& pointA, const cv::Point2i& pointB)
+{
+	cv::Vec2i a(pointA.x, pointA.y);
+	cv::Vec2i b(pointB.x, pointB.y);
+	return cv::norm(a-b);
+}
+
+bool pointInRect(const cv::Point2i& point, const cv::Rect& rect)
+{
+	return point.x >= rect.x && point.x <= rect.x+rect.width &&
+		point.y >= rect.y && point.y <= rect.y+rect.height;
+}
diff --git a/SmartCrop/utils.h b/SmartCrop/utils.h
new file mode 100644
index 0000000..ea13523
--- /dev/null
+++ b/SmartCrop/utils.h
@@ -0,0 +1,34 @@
+/*
+ * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <filesystem>
+#include <vector>
+#include <utility>
+#include <opencv2/core.hpp>
+
+bool isImagePath(const std::filesystem::path& path);
+
+void getImageFiles(const std::filesystem::path& path, std::vector<std::filesystem::path>& paths);
+
+cv::Rect rectFromPoints(const std::vector<std::pair<cv::Point2i, int>>& points);
+
+double pointDist(const cv::Point2i& pointA, const cv::Point2i& pointB);
+
+bool pointInRect(const cv::Point2i& point, const cv::Rect& rect);
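Editor's note: the template arguments of rectFromPoints's parameter were lost in extraction; only the pair's first member (a cv::Point2i) is used, so the second member is reconstructed above as int purely as an assumption. rectFromPoints returns the axis-aligned bounding box of the points, and pointInRect treats the rectangle's edges as inclusive. A quick check with hypothetical values:

```cpp
#include <cassert>
#include "utils.h"

int main()
{
	// the pair's second member (int here by assumption) is ignored by rectFromPoints
	std::vector<std::pair<cv::Point2i, int>> points = {
		{{10, 20}, 0}, {{40, 5}, 0}, {{25, 60}, 0}};
	cv::Rect box = rectFromPoints(points);
	assert(box == cv::Rect(10, 5, 30, 55)); // left, top, width, height
	assert(pointInRect({40, 60}, box));     // edges are inclusive
	return 0;
}
```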
diff --git a/SmartCrop/yolo.cpp b/SmartCrop/yolo.cpp
new file mode 100644
index 0000000..531df96
--- /dev/null
+++ b/SmartCrop/yolo.cpp
@@ -0,0 +1,278 @@
+//
+// SmartCrop - A tool for content aware cropping of images
+// Copyright (C) 2024 Carl Philipp Klemm
+//
+// This file is part of SmartCrop.
+//
+// SmartCrop is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// SmartCrop is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include <filesystem>
+#include <random>
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "yolo.h"
+#include "readfile.h"
+#include "tokenize.h"
+#include "log.h"
+
+#define INCBIN_PREFIX r
+#include "incbin.h"
+
+INCTXT(defaultClasses, WEIGHT_DIR "/classes.txt");
+INCBIN(defaultModel, WEIGHT_DIR "/yolov8x.onnx");
+
+Yolo::Yolo(const std::filesystem::path &onnxModelPath, const cv::Size &modelInputShape,
+	const std::filesystem::path& classesTxtFilePath, bool runWithOCl)
+{
+	modelPath = onnxModelPath;
+	modelShape = modelInputShape;
+
+	if(classesTxtFilePath.empty())
+	{
+		Log(Log::INFO)<<"Using builtin classes";
+		loadClasses(rdefaultClassesData);
+	}
+	else
+	{
+		std::string classesStr = readFile(classesTxtFilePath);
+		loadClasses(classesStr);
+	}
+
+	if(!modelPath.empty())
+	{
+		net = cv::dnn::readNetFromONNX(modelPath);
+	}
+	else
+	{
+		Log(Log::INFO)<<"Using builtin yolo model";
+		net = cv::dnn::readNetFromONNX((const char*)rdefaultModelData, rdefaultModelSize);
+	}
+	if(runWithOCl)
+	{
+		net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
+		net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
+	}
+	else
+	{
+		net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+		net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+	}
+}
+
+std::vector<Yolo::Detection> Yolo::runInference(const cv::Mat &input)
+{
+	cv::Mat modelInput = input;
+	if (letterBoxForSquare && modelShape.width == modelShape.height)
+		modelInput = formatToSquare(modelInput);
+
+	cv::Mat blob;
+	cv::dnn::blobFromImage(modelInput, blob, 1.0/255.0, modelShape, cv::Scalar(), true, false);
+	net.setInput(blob);
+
+	std::vector<cv::Mat> outputs;
+	net.forward(outputs, net.getUnconnectedOutLayersNames());
+
+	int rows = outputs[0].size[1];
+	int dimensions = outputs[0].size[2];
+
+	bool yolov8 = false;
+	// yolov5 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c])
+	// yolov8 has an output of shape (batchSize, 84, 8400) (Num classes + box[x,y,w,h])
+	if (dimensions > rows) // Check if the shape[2] is more than shape[1] (yolov8)
+	{
+		yolov8 = true;
+		rows = outputs[0].size[2];
+		dimensions = outputs[0].size[1];
+
+		outputs[0] = outputs[0].reshape(1, dimensions);
+		cv::transpose(outputs[0], outputs[0]);
+	}
+	float *data = (float *)outputs[0].data;
+
+	float x_factor = modelInput.cols / modelShape.width;
+	float y_factor = modelInput.rows / modelShape.height;
+
+	std::vector<int> class_ids;
+	std::vector<float> confidences;
+	std::vector<cv::Rect> boxes;
+
+	for (int i = 0; i < rows; ++i)
+	{
+		if (yolov8)
+		{
+			float *classes_scores = data+4;
+
+			cv::Mat scores(1, (int)classes.size(), CV_32FC1, classes_scores);
+			cv::Point class_id;
+			double maxClassScore;
+
+			minMaxLoc(scores, 0, &maxClassScore, 0, &class_id);
+
+			if (maxClassScore > modelScoreThreshold)
+			{
+				confidences.push_back(maxClassScore);
+				class_ids.push_back(class_id.x);
+
+				float x = data[0];
+				float y = data[1];
+				float w = data[2];
+				float h = data[3];
+
+				int left = int((x - 0.5 * w) * x_factor);
+				int top = int((y - 0.5 * h) * y_factor);
+
+				int width = int(w * x_factor);
+				int height = int(h * y_factor);
+
+				boxes.push_back(cv::Rect(left, top, width, height));
+			}
+		}
+		else // yolov5
+		{
+			float confidence = data[4];
+
+			if (confidence >= modelConfidenceThreshold)
+			{
+				float *classes_scores = data+5;
+
+				cv::Mat scores(1, (int)classes.size(), CV_32FC1, classes_scores);
+				cv::Point class_id;
+				double max_class_score;
+
+				minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
+
+				if (max_class_score > modelScoreThreshold)
+				{
+					confidences.push_back(confidence);
+					class_ids.push_back(class_id.x);
+
+					float x = data[0];
+					float y = data[1];
+					float w = data[2];
+					float h = data[3];
+
+					int left = int((x - 0.5 * w) * x_factor);
+					int top = int((y - 0.5 * h) * y_factor);
+
+					int width = int(w * x_factor);
+					int height = int(h * y_factor);
+
+					boxes.push_back(cv::Rect(left, top, width, height));
+				}
+			}
+		}
+
+		data += dimensions;
+	}
+
+	std::vector<int> nms_result;
+	cv::dnn::NMSBoxes(boxes, confidences, modelScoreThreshold, modelNMSThreshold, nms_result);
+
+	std::vector<Detection> detections{};
+	for(unsigned long i = 0; i < nms_result.size(); ++i)
+	{
+		int idx = nms_result[i];
+
+		Yolo::Detection result;
+		result.class_id = class_ids[idx];
+		result.confidence = confidences[idx];
+
+		//assign each detection a random, reasonably bright color for drawing
+		std::random_device rd;
+		std::mt19937 gen(rd());
+		std::uniform_int_distribution<int> dis(100, 255);
+		result.color = cv::Scalar(dis(gen), dis(gen), dis(gen));
+
+		result.className = classes[result.class_id].first;
+		result.priority = classes[result.class_id].second;
+		clampBox(boxes[idx], input.size());
+		result.box = boxes[idx];
+		detections.push_back(result);
+	}
+
+	return detections;
+}
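Editor's note: x_factor and y_factor map YOLO's center-format boxes from model-input space back to the (possibly letterboxed) source image; the resulting rectangles can still overhang the image edges, which clampBox below corrects. A minimal sketch of just the coordinate conversion used above, not part of the commit:

```cpp
#include <opencv2/core.hpp>

// Convert a YOLO (cx, cy, w, h) box in model-input space to a cv::Rect in
// source-image space, given per-axis scale factors.
cv::Rect yoloBoxToRect(float cx, float cy, float w, float h,
                       float xFactor, float yFactor)
{
	int left   = int((cx - 0.5f * w) * xFactor);
	int top    = int((cy - 0.5f * h) * yFactor);
	int width  = int(w * xFactor);
	int height = int(h * yFactor);
	return cv::Rect(left, top, width, height);
}
```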
+
+void Yolo::clampBox(cv::Rect& box, const cv::Size& size)
+{
+	//move edges that overhang the image back onto it, shrinking the box
+	if(box.x < 0)
+	{
+		box.width += box.x;
+		box.x = 0;
+	}
+	if(box.y < 0)
+	{
+		box.height += box.y;
+		box.y = 0;
+	}
+	if(box.x+box.width > size.width)
+		box.width = size.width - box.x;
+	if(box.y+box.height > size.height)
+		box.height = size.height - box.y;
+}
+
+void Yolo::loadClasses(const std::string& classesStr)
+{
+	std::vector<std::string> candidateClasses = tokenizeBinaryIgnore(classesStr, '\n', '"', '\\');
+	classes.clear();
+	for(std::string& instance : candidateClasses)
+	{
+		if(instance.size() < 2)
+			continue;
+
+		std::vector<std::string> tokens = tokenizeBinaryIgnore(instance, ',', '"', '\\');
+
+		//strip surrounding quotes from the class name
+		if(*tokens[0].begin() == '"')
+			tokens[0].erase(tokens[0].begin());
+		if(tokens[0].back() == '"')
+			tokens[0].pop_back();
+		int priority = -1;
+		if(tokens.size() > 1)
+		{
+			try
+			{
+				priority = std::stoi(tokens[1]);
+			}
+			catch(const std::invalid_argument& err)
+			{
+				Log(Log::WARN)<<"unable to get priority for class "<<tokens[0];
+			}
+		}
+		classes.push_back({tokens[0], priority});
+	}
+}
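Editor's note: extraction swallowed the remainder of yolo.cpp here (the bodies of formatToSquare and getClassForStr declared in yolo.h); the end of loadClasses above is reconstructed from the surrounding code and the class list's (name, priority) shape. End to end, the detector is constructed once, falling back to the embedded weights and class list, and then run per image. A usage sketch with a hypothetical file name:

```cpp
#include <opencv2/imgcodecs.hpp>
#include <iostream>
#include "yolo.h"

int main()
{
	Yolo yolo; // empty paths select the embedded model and class list
	cv::Mat image = cv::imread("photo.jpg"); // hypothetical input
	if(image.empty())
		return 1;

	for(const Yolo::Detection& detection : yolo.runInference(image))
		std::cout << detection.className << " (" << detection.confidence
		          << ") priority " << detection.priority << '\n';
	return 0;
}
```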
diff --git a/SmartCrop/yolo.h b/SmartCrop/yolo.h
new file mode 100644
--- /dev/null
+++ b/SmartCrop/yolo.h
+/*
+ * SmartCrop - A tool for content aware cropping of images
+ * Copyright (C) 2024 Carl Philipp Klemm
+ *
+ * This file is part of SmartCrop.
+ *
+ * SmartCrop is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SmartCrop is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SmartCrop. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <filesystem>
+#include <string>
+#include <vector>
+#include <utility>
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/dnn.hpp>
+
+class Yolo
+{
+public:
+	struct Detection
+	{
+		int class_id = 0;
+		std::string className;
+		float confidence = 0.0;
+		int priority = -1;
+		cv::Scalar color;
+		cv::Rect box;
+	};
+
+private:
+	static constexpr float modelConfidenceThreshold = 0.25;
+	static constexpr float modelScoreThreshold = 0.45;
+	static constexpr float modelNMSThreshold = 0.50;
+
+	std::string modelPath;
+	std::vector<std::pair<std::string, int>> classes;
+	cv::Size2f modelShape;
+	bool letterBoxForSquare = true;
+	cv::dnn::Net net;
+
+	void loadClasses(const std::string& classes);
+	void loadOnnxNetwork(const std::filesystem::path& path);
+	cv::Mat formatToSquare(const cv::Mat &source);
+	static void clampBox(cv::Rect& box, const cv::Size& size);
+
+public:
+	Yolo(const std::filesystem::path &onnxModelPath = "", const cv::Size& modelInputShape = {640, 480},
+		const std::filesystem::path& classesTxtFilePath = "", bool runWithOCl = true);
+	std::vector<Detection> runInference(const cv::Mat &input);
+	int getClassForStr(const std::string& str) const;
+};
diff --git a/Weights/classes.txt b/Weights/classes.txt
new file mode 100644
index 0000000..a56ca8e
--- /dev/null
+++ b/Weights/classes.txt
@@ -0,0 +1,80 @@
+person, 10
+bicycle, 4
+car, 3
+motorcycle, 4
+airplane, 4
+bus, 4
+train, 4
+truck, 3
+boat, 4
+traffic light, 1
+fire hydrant, 1
+stop sign, 1
+parking meter, 1
+bench, 2
+bird, 5
+cat, 6
+dog, 5
+horse, 4
+sheep, 5
+cow, 4
+elephant, 5
+bear, 5
+zebra, 5
+giraffe, 5
+backpack, 3
+umbrella, 3
+handbag, 3
+tie, 3
+suitcase, 2
+frisbee, 3
+skis, 3
+snowboard, 3
+sports ball, 3
+kite, 4
+baseball bat, 3
+baseball glove, 3
+skateboard, 3
+surfboard, 3
+tennis racket, 3
+bottle, 2
+wine glass, 2
+cup, 2
+fork, 1
+knife, 1
+spoon, 1
+bowl, 1
+banana, 1
+apple, 1
+sandwich, 1
+orange, 1
+broccoli, 1
+carrot, 1
+hot dog, 1
+pizza, 1
+donut, 2
+cake, 2
+chair, 1
+couch, 1
+potted plant, 1
+bed, 1
+dining table, 1
+toilet, 1
+tv, 1
+laptop, 1
+mouse, 1
+remote, 1
+keyboard, 1
+cell phone, 1
+microwave, 1
+oven, 1
+toaster, 1
+sink, 1
+refrigerator, 1
+book, 1
+clock, 1
+vase, 1
+scissors, 1
+teddy bear, 1
+hair drier, 1
+toothbrush, 1
diff --git a/Weights/face_detection_yunet_2023mar.onnx b/Weights/face_detection_yunet_2023mar.onnx
new file mode 100644
index 0000000..f9beb30
Binary files /dev/null and b/Weights/face_detection_yunet_2023mar.onnx differ
diff --git a/Weights/face_recognition_sface_2021dec.onnx b/Weights/face_recognition_sface_2021dec.onnx
new file mode 100644
index 0000000..6767be0
Binary files /dev/null and b/Weights/face_recognition_sface_2021dec.onnx differ