{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Post-processing of HUMAnN data\n", "\n", "This section outlines how to generate some basic statistics and plots from the output of the HUMAnN analysis pipeline used in UNLOCK.\n", "\n", "This section is written for output from [HUMAnN 3.0](https://github.com/biobakery/biobakery/wiki/humann3).\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```\n", "\n", "To follow this guide, install the following dependencies (if not installed yet):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "try:\n", " !humann --version\n", "except:\n", " !pip install humann\n", "\n", "try:\n", " import matplotlib\n", "except ImportError:\n", " !pip install matplotlib\n", "\n", "try:\n", " import pandas\n", "except ImportError:\n", " !pip install pandas\n", "\n", "try:\n", " import seaborn\n", "except ImportError:\n", " !pip install seaborn\n", "\n", "try:\n", " import numpy\n", "except ImportError:\n", " !pip install numpy\n", "\n", "try:\n", " import scipy\n", "except ImportError:\n", " !pip install scipy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Connection settings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.session import iRODSSession\n", "import ssl\n", "from getpass import getpass\n", "\n", "# iRODS authentication information\n", "host = \"data.m-unlock.nl\"\n", "port = 1247\n", "zone = \"unlock\"\n", "user = \"\"\n", "password = getpass()\n", "\n", "# SSL settings\n", "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n", "\n", "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", " 'irods_encryption_algorithm': 'AES-256-CBC',\n", " 'irods_encryption_key_size': 32,\n", " 
'irods_encryption_num_hash_rounds': 16,\n", " 'irods_encryption_salt_size': 8,\n", " \"irods_authentication_scheme\": \"native\",\n", " 'ssl_context': context}\n", "\n", "print(\"Authenticating to iRODS as... \" + user)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "irods_env_dir = \"~/.irods\"\n", "irods_env_file = \"irods_environment.json\"\n", "\n", "\n", "from pathlib import Path\n", "from ibridges import Session\n", "from getpass import getpass\n", "\n", "env_loc = Path(irods_env_dir) / irods_env_file\n", "env_file = Path.expanduser(Path(env_loc))\n", "\n", "password = getpass()\n", "session = Session(env_file, password=password)\n", "\n", "if session:\n", " print(\"Session successfully established\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data retrieval\n", "This will download only the humann subfolders in your specific study\n" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "## set study and make download dir\n", "\n", "from ibridges import IrodsPath\n", "from pathlib import Path\n", "\n", "# Set investigation and study\n", "investigation = \"gutbrain_axis\"\n", "study = \"mind-set\"\n", "\n", "# Define where to download files locally - this creates a relative path from where this notebook is located\n", "download_path = f\"./unlock_downloads/{investigation}/{study}\"\n", "\n", "# Create the directory if it doesn't exist yet\n", "download_dir = Path.expanduser(Path(download_path))\n", "download_dir.mkdir( parents=True, exist_ok=True )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from ibridges import IrodsPath, search_data, download\n", "from pathlib import Path\n", "import os\n", "\n", "# Recursively downloads all humann3 folders\n", "folder_pattern=\"humann3\"\n", "# Define search path\n", "search = f\"/unlock/home/wur.{investigation}/stu_{study}/\"\n", "data = search_data(session, 
path=IrodsPath(session, search), path_pattern=folder_pattern)\n", "\n", "# Set options and counters\n", "overwrite = False\n", "downloaded, skipped = 0,0\n", "unique_folders = []\n", "\n", "# Go through the search results and download all specified folders\n", "for item in data:\n", " irods_path = IrodsPath(session, item) # Create an IrodsPath object for the item\n", " run = irods_path.parent.name # Extract the name of the parent folder\n", " local_destination = Path(download_path) / run # Construct the local destination path\n", " if item.collection_exists(): # Only process directories (collections)\n", " if not local_destination.exists() or overwrite:\n", " local_destination.mkdir(parents=True, exist_ok=True)\n", " download(session, item, local_destination, overwrite=overwrite)\n", " downloaded += 1\n", " elif len(os.listdir(local_destination)) == 0:\n", " download(session, item, local_destination, overwrite=overwrite)\n", " downloaded += 1\n", " else:\n", " skipped += 1\n", " elif item not in unique_folders: # Avoid downloading already processed items\n", " unique_folders.append(item)\n", "\n", "# Print download summary\n", "print(\"\\nDownloaded: \", downloaded)\n", "print(\"Skipped: \", skipped)\n", "print(\"Total unique folders processed:\", len(unique_folders))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set study input folder\n", "Let's set up the specific study input folder:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "# Set the input folder\n", "input_folder = \"path/to/the/input/data\"\n", "\n", "# Verify the path\n", "if not os.path.exists(input_folder):\n", " print(f\"Error: The folder '{input_folder}' does not exist.\")\n", "else:\n", " print(f\"Using input folder: {os.path.abspath(input_folder)}\")\n", "\n", "output_folder = f\"{input_folder}/output\"\n", "os.makedirs(output_folder, exist_ok=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
Join tables\n", "We will merge the gene family and pathway abundance outputs into a single table, but first we need to flatten the folder structure by copying all relevant .tsv files into the same directory (by setting the corresponding suffixes). Note that we can use any of the variety of HUMAnN output files, below are just some illustrative examples but pick the normalization and regrouping methods best suited to your analysis:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import shutil\n", "import glob\n", "\n", "genefamily_suffix = \"genefamilies-community-relab_go\"\n", "genefamily_files = glob.glob(os.path.join(input_folder, \"**\", f\"*_{genefamily_suffix}.tsv\"), recursive=True)\n", "for file in genefamily_files:\n", " shutil.copy2(file, output_folder)\n", "\n", "pathabundance_suffix = \"pathabundance-community-cpm\"\n", "pathabundance_files = glob.glob(os.path.join(input_folder, \"**\", f\"*_{pathabundance_suffix}.tsv\"), recursive=True)\n", "for file in pathabundance_files:\n", " shutil.copy2(file, output_folder)\n", " \n", "print(f\"Copied {len(genefamily_files)} genefamily files to {output_folder}\")\n", "print(f\"Copied {len(pathabundance_files)} pathabundance files to {output_folder}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we can use the humann_join_tables script, this might take a while depending on the amount of files, especially for the genefamilies:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!humann_join_tables --input \"{output_folder}\" \\\n", "--output \"{output_folder}/{genefamily_suffix}_joined.tsv\" \\\n", "--file_name {genefamily_suffix}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!humann_join_tables --input \"{output_folder}\" \\\n", "--output \"{output_folder}/{pathabundance_suffix}_joined.tsv\" \\\n", "--file_name {pathabundance_suffix}" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "Let's clean up the flattened folder structure:" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "for file in genefamily_files:\n", " try:\n", " os.remove(os.path.join(output_folder, os.path.basename(file)))\n", " except FileNotFoundError:\n", " pass" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "for file in pathabundance_files:\n", " try:\n", " os.remove(os.path.join(output_folder, os.path.basename(file)))\n", " except FileNotFoundError:\n", " pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we can create some basic visualizations based on our merged tsv files. Humann also contains a barplot function:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!humann_barplot -h" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use this barplot function to create a differentially abundant pathway png file in our output folder, it is prudent to add metadata to elucidate patterns in your data. See the HUMAnN [manual](https://github.com/biobakery/biobakery/wiki/humann3#5-plotting-stratified-functions) for more information." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!humann_barplot --input \"{output_folder}/{pathabundance_suffix}_joined.tsv\" \\\n", "--focal-feature \"1CMET2-PWY\" \\\n", "--sort sum \\\n", "--output \"{output_folder}/barplot_{pathabundance_suffix}_joined.png\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can take a glance at the most abundant gene families:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "# Load the merged HUMAnN output file\n", "genefamily_file = f\"{output_folder}/{genefamily_suffix}_joined.tsv\"\n", "df = pd.read_csv(genefamily_file, sep=\"\\t\")\n", "\n", "# Ensure the first column is treated as the index (features)\n", "df.set_index(df.columns[0], inplace=True)\n", "\n", "# Sum across samples (if multiple columns exist)\n", "df[\"Total_Abundance\"] = df.sum(axis=1)\n", "\n", "# Select the top 10 most abundant features\n", "top_features = df.nlargest(10, \"Total_Abundance\")\n", "\n", "# Plot\n", "plt.figure(figsize=(10, 6))\n", "plt.barh(top_features.index, top_features[\"Total_Abundance\"], color=\"skyblue\")\n", "plt.xlabel(\"Total Abundance\")\n", "plt.ylabel(\"Feature\")\n", "plt.title(\"Top 10 Most Abundant Features\")\n", "plt.gca().invert_yaxis() # Highest at the top\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also generate some global summary statistics:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "# Load the gene family abundance data\n", "genefamily_file = f\"{output_folder}/{genefamily_suffix}_joined.tsv\"\n", "df = pd.read_csv(genefamily_file, sep='\\t', index_col=0)\n", "\n", "# Calculate global summary statistics\n", "summary_stats = pd.DataFrame({\n", " \"mean\": [df.values.mean()],\n", " 
\"std\": [df.values.std()],\n", " \"median\": [np.median(df.values)],\n", " \"min\": [df.values.min()],\n", " \"max\": [df.values.max()]\n", "})\n", "print(summary_stats)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As well as statistics on the first 10 samples:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "summary_stats = df.describe().T.rename(columns={'50%': 'median'})[['mean', 'std', 'median', 'min', 'max']]\n", "summary_stats = summary_stats.head(10) # or use .sample(10) for random\n", "print(summary_stats)" ] } ], "metadata": { "kernelspec": { "display_name": "Testing_nbs", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 }