{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Vignette: Using the Synderm dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import get_token\n", "from torch.utils.data import DataLoader\n", "from huggingface_hub import HfApi\n", "import matplotlib.pyplot as plt\n", "import webdataset as wds\n", "from PIL import Image\n", "import pandas as pd\n", "import json\n", "import io\n", "import re" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/synthetic-derm\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/workspace/synthetic-derm/.venv/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" ] } ], "source": [ "# Set path to root directory of package\n", "%cd ../../../" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset Statistics\n", "First, we will show some dataset statistics. Since the dataset is so large (about 1 million images), we have crawled the complete dataset beforehand and generated a csv file at `huggingface/folder_counts.csv`. 
This lists the total png image count for each combination of disease and synthetic generation type (pretrained/finetuned and inpaint/outpaint/text-to-image)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "df = pd.read_csv(\"huggingface/folder_counts.csv\")\n",
 "df = df.rename(columns={\"Image Count\": \"count\", \"Folder\": \"folder\"})\n",
 "\n",
 "# Suffix vocabularies, keyed by the column they populate. Components are\n",
 "# stripped from the right-hand end of the folder name, one per key, in the\n",
 "# order the keys are listed here.\n",
 "extract = {\n",
 "    \"submethods\": [\"inpaint\", \"inpaint_outpaint\", \"text_to_image\"],\n",
 "    \"methods\": [\"inpaint\", \"text_to_image\"],\n",
 "    \"ft\": [\"finetune\", \"pretrained\"]\n",
 "}\n",
 "\n",
 "def match_replace(str, values):\n",
 "    \"\"\"Split one trailing `_<value>` component off a folder name.\n",
 "\n",
 "    Args:\n",
 "        str: Folder name, or the part of it that is still unparsed. The\n",
 "            parameter shadows the builtin; the name is kept so that\n",
 "            existing calls keep working.\n",
 "        values: Candidate suffixes, tried in order; the first match wins.\n",
 "\n",
 "    Returns:\n",
 "        `(prefix, value)` with the matched suffix and its separating\n",
 "        underscore removed, or `(str, None)` when no candidate matches.\n",
 "    \"\"\"\n",
 "    for value in values:\n",
 "        if str == value:\n",
 "            # The whole string is the suffix. Slicing to\n",
 "            # len(str) - len(value) - 1 would give -1 here and cut off a\n",
 "            # character of the match instead of returning an empty prefix.\n",
 "            return \"\", value\n",
 "        if str.endswith(\"_\" + value):\n",
 "            # Drop the suffix together with the underscore in front of it\n",
 "            return str[:-(len(value) + 1)], value\n",
 "    return str, None  # Return default if no match found\n",
 "\n",
 "# Peel one component per pass; whatever is left after the last pass is the label\n",
 "df[\"label\"] = df[\"folder\"]\n",
 "for key, values in extract.items():\n",
 "    result = df[\"label\"].apply(lambda x: match_replace(x, values))\n",
 "    df[\"label\"] = result.apply(lambda x: x[0])\n",
 "    df[key] = result.apply(lambda x: x[1])"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | folder | \n", "count | \n", "label | \n", "submethods | \n", "methods | \n", "ft | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "acne_finetune_text_to_image_text_to_image | \n", "1260 | \n", "acne | \n", "text_to_image | \n", "text_to_image | \n", "finetune | \n", "
| 1 | \n", "acne_vulgaris_finetune_text_to_image_text_to_i... | \n", "2540 | \n", "acne_vulgaris | \n", "text_to_image | \n", "text_to_image | \n", "finetune | \n", "
| 2 | \n", "actinic_keratosis_finetune_text_to_image_text_... | \n", "2940 | \n", "actinic_keratosis | \n", "text_to_image | \n", "text_to_image | \n", "finetune | \n", "
| 3 | \n", "all_finetune_inpaint_inpaint | \n", "35300 | \n", "all | \n", "inpaint | \n", "inpaint | \n", "finetune | \n", "
| 4 | \n", "all_finetune_inpaint_inpaint_outpaint | \n", "35300 | \n", "all | \n", "inpaint_outpaint | \n", "inpaint | \n", "finetune | \n", "