{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Processing Amplicon Long Read Data\n", "\n", "Below is an example on how to analyse the amplicon data obtained from long read sequencing...\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip install matplotlib\n", "# %pip install seaborn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Connection settings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.session import iRODSSession\n", "import ssl\n", "from irods.meta import iRODSMeta\n", "import os, sys\n", "\n", "# iRODS authentication information; credentials are read from environment variables\n", "host = \"unlock-icat.irods.surfsara.nl\"\n", "port = 1247\n", "zone = \"unlock\"\n", "user = os.getenv('irodsUserName')\n", "password = os.getenv('irodsPassword')\n", "\n", "# Fail fast with a clear message when the credentials are not configured,\n", "# instead of a TypeError on the string concatenation below\n", "assert user and password, \"Set the irodsUserName and irodsPassword environment variables\"\n", "\n", "# SSL settings (cafile/capath/cadata default to None, so they need not be passed)\n", "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH)\n", "\n", "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", " 'irods_encryption_algorithm': 'AES-256-CBC',\n", " 'irods_encryption_key_size': 32,\n", " 'irods_encryption_num_hash_rounds': 16,\n", " 'irods_encryption_salt_size': 8,\n", " \"irods_authentication_scheme\": \"pam\",\n", " 'ssl_context': context}\n", "\n", "print(\"Authenticating to iRODS as... 
\" + user)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data retrieval\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.models import Collection, DataObject\n", "from irods.column import Criterion\n", "import os\n", "\n", "# Group identifier\n", "GROUP = \"wur.srt_fermentation\"\n", "\n", "# Standard authentication\n", "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n", " # Set the filters to select the files needed for downloading\n", " results = session.query(Collection, DataObject).filter( \\\n", " Criterion('like', Collection.name, '%'+GROUP+'%')).filter( \\\n", " Criterion('like', DataObject.name, '%_rel-abundance.tsv'))\n", " \n", " retrieved = set()\n", " for index, r in enumerate(results):\n", " # Download the result files\n", " data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n", " # Original path in irods is local path from current location\n", " if not os.path.exists(\".\" + data_file):\n", " os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n", " print(\"Downloading: \", index, data_file, end=\"\\r\")\n", " session.data_objects.get(data_file, \".\" + data_file)\n", " else:\n", " print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")\n", " retrieved.add(\".\" + data_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Concatenate the files containing tabular information into a single table for further processing\n", "import pandas as pd\n", "import numpy as np\n", "\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 1000)\n", "\n", "# Initialize an 
empty list to collect the per-file DataFrames\n", "frames = []\n", "\n", "# Loop over each file path in the 'retrieved' set\n", "for file_path in retrieved:\n", " # Read the tab-separated abundance table into a DataFrame\n", " df = pd.read_csv(file_path, sep='\\t', index_col=None, header=0)\n", " \n", " # Add the sample name (filename up to '_amplicon') as a column\n", " df['filename'] = file_path.split(\"/\")[-1].split(\"_amplicon\")[0]\n", " \n", " frames.append(df)\n", "\n", "# Concatenate once after the loop; growing a DataFrame with concat inside\n", "# the loop copies all previously merged rows on every iteration (quadratic).\n", "# 'ignore_index=True' gives the merged table a fresh 0..n-1 index\n", "merged_df = pd.concat(frames, ignore_index=True)\n", "\n", "print(merged_df)\n", "\n", "# Multiply the abundance column by 100 to get percentages\n", "merged_df['abundance'] = merged_df['abundance'] * 100\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Barplot the phylum abundance for each filename\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "plt.figure(figsize=(10,6))\n", "sns.barplot(x='phylum', y='abundance', hue='filename', data=merged_df, errorbar=None)\n", "plt.title('Phylum Abundance for Each File')\n", "plt.xlabel('phylum')\n", "plt.ylabel('Abundance')\n", "plt.xticks(rotation=90)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Make a table of the species, the abundance and the filename\n", "extracted_df = merged_df[['species', 'abundance', 'filename']]\n", "\n", "# Sum the abundance for each species and filename\n", "pivot_df = 
extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)\n", "\n", "# Each sample (column) holds relative abundances, so it should total 100%\n", "sums = pivot_df.sum(axis=0)\n", "# Compare with a small tolerance instead of exact float equality\n", "assert all(abs(total - 100.0) < 0.01 for total in sums), list(sums)\n", "\n", "# Plot the transposed pivot table\n", "pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Make a table of the species, the abundance and the filename.\n", "# Use .copy() so the 'Other' relabeling below cannot touch merged_df\n", "# and does not trigger a SettingWithCopyWarning\n", "extracted_df = merged_df[['species', 'abundance', 'filename']].copy()\n", "\n", "# When the abundance is below 5, change the species to 'Other'\n", "extracted_df.loc[extracted_df['abundance'] < 5, 'species'] = 'Other'\n", "# Sum the abundance for each species and filename\n", "pivot_df = extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)\n", "# Remove rows with all zeros\n", "pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]\n", "\n", "# Each sample (column) should still total 100% after the regrouping\n", "sums = pivot_df.sum(axis=0)\n", "# Compare with a small tolerance instead of exact float equality\n", "assert all(abs(total - 100.0) < 0.01 for total in sums), list(sums)\n", "\n", "# Plot the transposed pivot table\n", "pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocessing: Standardizing the data\n", "from sklearn.preprocessing import StandardScaler\n", "from adjustText import adjust_text\n", "from sklearn.decomposition import PCA\n", "\n", "# Transposing the DataFrame so rows = samples, columns = species\n", "df_numerical = pivot_df.T\n", "\n", "# Standardizing the data\n", "scaler = 
StandardScaler()\n", "df_scaled = scaler.fit_transform(df_numerical)\n", "\n", "# Applying PCA\n", "pca = PCA(n_components=2) # using 2 components for visualization purposes\n", "principal_components = pca.fit_transform(df_scaled)\n", "\n", "# Creating a DataFrame for the PCA results\n", "pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])\n", "\n", "# Adding the sample (dataset) labels back for interpretation\n", "pca_df['dataset'] = list(pivot_df.T.index)\n", "\n", "# Visualization\n", "plt.figure(figsize=(8, 6))\n", "plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])\n", "\n", "# Prepare a list to store the text objects for adjust_text\n", "texts = []\n", "for i, txt in enumerate(pca_df['dataset']):\n", " # Use .loc for scalar lookups; chained indexing (df[col][i]) is discouraged\n", " texts.append(plt.text(pca_df.loc[i, 'Principal Component 1'], pca_df.loc[i, 'Principal Component 2'], txt))\n", "\n", "# Using adjust_text to prevent overlapping\n", "adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))\n", "\n", "# After adjust_text, add bounding boxes to the texts\n", "for text in texts:\n", " text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))\n", "\n", "# Adding labels and title with the variable explained variance\n", "plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')\n", "plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')\n", "plt.title('PCA of Species Abundance Data')\n", "plt.grid(True)\n", "\n", "# Show the plot\n", "plt.show()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 2 }