{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bracken taxonomic abundances\n",
    "\n",
    "Below is an example on how to analyse bracken data of multiple samples.\n",
    "\n",
    "```{note}\n",
    "Please note that this document is a work in progress.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install python-irodsclient\n",
    "# !pip install matplotlib\n",
    "# !pip install adjustText\n",
    "# !pip install scikit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Connection settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from irods.session import iRODSSession\n",
    "import ssl\n",
    "from irods.meta import iRODSMeta\n",
    "import os, sys\n",
    "from getpass import getpass\n",
    "\n",
    "# iRODS authentication information\n",
    "host = \"data.m-unlock.nl\"\n",
    "port = 1247\n",
    "zone = \"unlock\"\n",
    "user = \"your SRAM username\"\n",
    "password = getpass()\n",
    "\n",
    "# SSL settings\n",
    "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n",
    "\n",
    "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n",
    "                'irods_client_server_policy': 'CS_NEG_REQUIRE',\n",
    "                'irods_encryption_algorithm': 'AES-256-CBC',\n",
    "                'irods_encryption_key_size': 32,\n",
    "                'irods_encryption_num_hash_rounds': 16,\n",
    "                'irods_encryption_salt_size': 8,\n",
    "                \"irods_authentication_scheme\": \"pam\",\n",
    "                'ssl_context': context}\n",
    "\n",
    "print(\"Authenticating to iRODS as... \" + user)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from irods.models import Collection, DataObject\n",
    "from irods.column import Criterion\n",
    "import os\n",
    "\n",
    "# Which datafiles do you want to retrieve\n",
    "\n",
    "# Denote them as they are in iRODS use \"%\" for all\n",
    "GROUP = \"wur.fdp\"\n",
    "STUDY = \"%\"\n",
    "\n",
    "run_id = \"ReadTaxonomy\"\n",
    "k2_database = \"%\"\n",
    "bracken_level = \"%\"\n",
    "\n",
    "## Download files\n",
    "file_filter = \"_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\"\n",
    "# Standard authentication\n",
    "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n",
    "    # Set the filters to select the files needed for downloading\n",
    "    results = session.query(Collection, DataObject).filter( \\\n",
    "        Criterion('like', Collection.name, '%'+GROUP+'/'+STUDY+'%')).filter( \\\n",
    "        Criterion('like', DataObject.name, '%_'+file_filter))\n",
    "    \n",
    "    retrieved = set()\n",
    "    for index, r in enumerate(results):\n",
    "        # Download the result files\n",
    "        data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n",
    "        # Original path in irods is local path from current location\n",
    "        if not os.path.exists(\".\" + data_file):\n",
    "            os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n",
    "            print(\"Downloading: \", index, data_file, end=\"\\r\")\n",
    "            session.data_objects.get(data_file, \".\" + data_file)\n",
    "        else:\n",
    "            print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")\n",
    "        retrieved.add(\".\" + data_file)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set filters to analyse a subset of the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to get the sample name from the filename\n",
    "def get_sample_name(filename,run_id, k2_database,bracken_level):\n",
    "    sample_name = filename.removeprefix(run_id+\"_\") #filename.lstrip(runID+\"_\")\n",
    "    sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n",
    "    return(sample_name)\n",
    "\n",
    "tax_levels = {'R1': 'Kingdom','D': 'Domain','P': 'Phylum','C': 'Class','O': 'Order','F': 'Family','G': 'Genus','S': 'Species','S1': 'Strain'}\n",
    "\n",
    "STUDY = \"stu_bmock12\"\n",
    "run_id = \"ReadTaxonomy\"\n",
    "\n",
    "k2_database = \"k2_pluspfp_20240112\"\n",
    "#k2_database = \"kraken2_db_uhgg_v2.0.2\"\n",
    "bracken_level = \"S\"\n",
    "\n",
    "# \"filtered\" or \"unfiltered\"\n",
    "filtered = \"filtered\"\n",
    "\n",
    "filtered_paths = list(filter(lambda path: STUDY in path and \"_\"+filtered+\"_\" in path and k2_database in path and bracken_level+\".txt\" in path, retrieved))\n",
    "\n",
    "print(\"sample name\\tfilename\\tfull_location\")\n",
    "for file_path in filtered_paths:\n",
    "    filename = file_path.split(\"/\")[-1]\n",
    "    sample_name = get_sample_name(filename,run_id, k2_database,bracken_level)\n",
    "    sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n",
    "    print(sample_name+\"\\t\"+filename+\"\\t\"+file_path)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concatenate the files containing tabular information into a single file for further processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import glob\n",
    "\n",
    "pd.set_option('display.max_rows', 500)\n",
    "pd.set_option('display.max_columns', 500)\n",
    "pd.set_option('display.width', 1000)\n",
    "\n",
    "# Initialize an empty DataFrame\n",
    "merged_df = pd.DataFrame()\n",
    "\n",
    "# Loop over each file path in the filtered list\n",
    "for file_path in filtered_paths:\n",
    "    # Read the file into a DataFrame\n",
    "    df = pd.read_csv(file_path, sep='\\t', index_col=None, header=0)\n",
    "    \n",
    "    # Get clean filename to get the sample name\n",
    "    filename = file_path.split(\"/\")[-1]\n",
    "    sample_name = get_sample_name(filename,run_id,k2_database,bracken_level)\n",
    "    # Add the filename as a column in the DataFrame\n",
    "    df['sample'] = sample_name\n",
    "    \n",
    "    # Reset the index of the DataFrame to avoid duplicate indices\n",
    "    # 'drop=True' prevents the old indices from being added as a new column\n",
    "    # 'inplace=True' makes the change in the existing DataFrame\n",
    "    df.reset_index(drop=True, inplace=True)\n",
    "    \n",
    "    # Concatenate the current DataFrame with the merged DataFrame\n",
    "    # 'ignore_index=True' resets the index in the final DataFrame\n",
    "    merged_df = pd.concat([merged_df, df], ignore_index=True)\n",
    "    \n",
    "# Multiple the abundance column by 100 to get percentages\n",
    "merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating the plot with an abundance threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.cm as cm\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "\n",
    "# Make a table of the fractions and the samples\n",
    "extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']]\n",
    "\n",
    "# When the abundance is below the set value, change the species/genus/family... to 'Other'\n",
    "abundance_threshold = 1\n",
    "# Apply abundance filter\n",
    "extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'\n",
    "\n",
    "# Identify the top 20 names (because we cant have >20 distinct colors)\n",
    "# Sort the DataFrame by 'fraction_total_reads' in descending order\n",
    "extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)\n",
    "top_20_names = extracted_df_sorted['name'].unique()[:20]\n",
    "# Replace names not in the top 20 with 'Other'\n",
    "extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'\n",
    "\n",
    "# Sum the abundance for each species and sample\n",
    "pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)\n",
    "# Remove rows with all zeros\n",
    "pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]\n",
    "\n",
    "# Sum the columns\n",
    "sums = pivot_df.sum(axis=0)\n",
    "# This should be 100 for each column\n",
    "sums = list(sums)\n",
    "print(sums)\n",
    "# Round the sums to 1 decimal places\n",
    "sums = [round(x, 0) for x in sums]\n",
    "# Check if the sums are 100\n",
    "assert sums == [100.0]*len(sums)\n",
    "\n",
    "# Transpose and sort \n",
    "pivot_df_sorted_T = pivot_df.T.sort_index()\n",
    "\n",
    "# Set a custom column and color\n",
    "custom_column = 'Other'\n",
    "custom_colors = {'Other': 'lightgrey'}\n",
    "# Move the custom column to the last position in the DataFrame\n",
    "pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]\n",
    "\n",
    "# Set a colormap\n",
    "cmap = plt.get_cmap('tab20')\n",
    "# Plot a stacked barchart with custom color\n",
    "ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, \n",
    "                     color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])\n",
    "\n",
    "# Angle the x labels\n",
    "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')\n",
    "# Set the y-axis ticks from 0 to 100 in steps of 10\n",
    "ax.yaxis.set_major_locator(MultipleLocator(10))\n",
    "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: to remove decimal points\n",
    "\n",
    "# XY Label names \n",
    "ax.set_xlabel('Sample', fontsize=16)\n",
    "ax.set_ylabel('Fraction', fontsize=16)\n",
    "# Set title\n",
    "ax.set_title('Bracken abundances on '+tax_levels[bracken_level]+\" level with database \"+k2_database, fontsize=16)\n",
    "# Move legend outside the plot\n",
    "ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PCA plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing: Standardizing the data\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from adjustText import adjust_text\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Transposing the DataFrame\n",
    "df_numerical = pivot_df.T\n",
    "\n",
    "# Standardizing the data\n",
    "scaler = StandardScaler()\n",
    "df_scaled = scaler.fit_transform(df_numerical)\n",
    "\n",
    "# Applying PCA\n",
    "pca = PCA(n_components=2)  # using 2 components for visualization purposes\n",
    "principal_components = pca.fit_transform(df_scaled)\n",
    "\n",
    "# Creating a DataFrame for the PCA results\n",
    "pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])\n",
    "\n",
    "# Adding the phylum information back for interpretation\n",
    "pca_df['dataset'] = list(pivot_df.T.index)\n",
    "\n",
    "# Visualization\n",
    "plt.figure(figsize=(8, 6))\n",
    "plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])\n",
    "\n",
    "# Prepare a list to store the text objects for adjust_text\n",
    "texts = []\n",
    "for i, txt in enumerate(pca_df['dataset']):\n",
    "    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))\n",
    "\n",
    "# Using adjust_text to prevent overlapping\n",
    "adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))\n",
    "\n",
    "# After adjust_text, add bounding boxes to the texts\n",
    "for text in texts:\n",
    "    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))\n",
    "\n",
    "# Adding labels and title with the variable explained variance\n",
    "plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')\n",
    "plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')\n",
    "plt.title('Bracken analysis\\nPCA on '+tax_levels[bracken_level]+\" level with database \"+k2_database)\n",
    "plt.grid(True)\n",
    "\n",
    "# Show the plot\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bracken_plot",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}