{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bracken taxonomic abundances\n", "\n", "Below is an example on how to analyse bracken data of multiple samples.\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install python-irodsclient\n", "# !pip install matplotlib\n", "# !pip install adjustText\n", "# !pip install scikit-learn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Connection settings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.session import iRODSSession\n", "import ssl\n", "from irods.meta import iRODSMeta\n", "import os, sys\n", "from getpass import getpass\n", "\n", "# iRODS authentication information\n", "host = \"unlock-icat.irods.surfsara.nl\"\n", "port = 1247\n", "zone = \"unlock\"\n", "user = \"your SRAM username\"\n", "password = getpass()\n", "\n", "# SSL settings\n", "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n", "\n", "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", " 'irods_encryption_algorithm': 'AES-256-CBC',\n", " 'irods_encryption_key_size': 32,\n", " 'irods_encryption_num_hash_rounds': 16,\n", " 'irods_encryption_salt_size': 8,\n", " \"irods_authentication_scheme\": \"pam\",\n", " 'ssl_context': context}\n", "\n", "print(\"Authenticating to iRODS as... \" + user)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download data files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.models import Collection, DataObject\n", "from irods.column import Criterion\n", "import os\n", "\n", "# Which datafiles do you want to retrieve\n", "\n", "# Denote them as they are in iRODS use \"%\" for all\n", "GROUP = \"wur.fdp\"\n", "STUDY = \"%\"\n", "\n", "run_id = \"ReadTaxonomy\"\n", "k2_database = \"%\"\n", "bracken_level = \"%\"\n", "\n", "## Download files\n", "file_filter = \"_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\"\n", "# Standard authentication\n", "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n", " # Set the filters to select the files needed for downloading\n", " results = session.query(Collection, DataObject).filter( \\\n", " Criterion('like', Collection.name, '%'+GROUP+'/'+STUDY+'%')).filter( \\\n", " Criterion('like', DataObject.name, '%_'+file_filter))\n", " \n", " retrieved = set()\n", " for index, r in enumerate(results):\n", " # Download the result files\n", " data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n", " # Original path in irods is local path from current location\n", " if not os.path.exists(\".\" + data_file):\n", " os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n", " print(\"Downloading: \", index, data_file, end=\"\\r\")\n", " session.data_objects.get(data_file, \".\" + data_file)\n", " else:\n", " print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")\n", " retrieved.add(\".\" + data_file)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set filters to analyse a subset of the files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to get the sample name from the filename\n", "def get_sample_name(filename,run_id, k2_database,bracken_level):\n", " sample_name = filename.removeprefix(run_id+\"_\") #filename.lstrip(runID+\"_\")\n", " sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n", " return(sample_name)\n", "\n", "tax_levels = {'R1': 'Kingdom','D': 'Domain','P': 'Phylum','C': 'Class','O': 'Order','F': 'Family','G': 'Genus','S': 'Species','S1': 'Strain'}\n", "\n", "STUDY = \"stu_bmock12\"\n", "run_id = \"ReadTaxonomy\"\n", "\n", "k2_database = \"k2_pluspfp_20240112\"\n", "#k2_database = \"kraken2_db_uhgg_v2.0.2\"\n", "bracken_level = \"S\"\n", "\n", "# \"filtered\" or \"unfiltered\"\n", "filtered = \"filtered\"\n", "\n", "filtered_paths = list(filter(lambda path: STUDY in path and \"_\"+filtered+\"_\" in path and k2_database in path and bracken_level+\".txt\" in path, retrieved))\n", "\n", "print(\"sample name\\tfilename\\tfull_location\")\n", "for file_path in filtered_paths:\n", " filename = file_path.split(\"/\")[-1]\n", " sample_name = get_sample_name(filename,run_id, k2_database,bracken_level)\n", " sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n", " print(sample_name+\"\\t\"+filename+\"\\t\"+file_path)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenate the files containing tabular information into a single file for further processing" ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import glob\n", "\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 1000)\n", "\n", "# Initialize an empty DataFrame\n", "merged_df = pd.DataFrame()\n", "\n", "# Loop over each file path in the filtered list\n", "for file_path in filtered_paths:\n", " # Read the file into a DataFrame\n", " df = pd.read_csv(file_path, sep='\\t', index_col=None, header=0)\n", " \n", " # Get clean filename to get the sample name\n", " filename = file_path.split(\"/\")[-1]\n", " sample_name = get_sample_name(filename,run_id,k2_database,bracken_level)\n", " # Add the filename as a column in the DataFrame\n", " df['sample'] = sample_name\n", " \n", " # Reset the index of the DataFrame to avoid duplicate indices\n", " # 'drop=True' prevents the old indices from being added as a new column\n", " # 'inplace=True' makes the change in the existing DataFrame\n", " df.reset_index(drop=True, inplace=True)\n", " \n", " # Concatenate the current DataFrame with the merged DataFrame\n", " # 'ignore_index=True' resets the index in the final DataFrame\n", " merged_df = pd.concat([merged_df, df], ignore_index=True)\n", " \n", "# Multiple the abundance column by 100 to get percentages\n", "merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating the plot with an abundance threshold" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib.cm as cm\n", "from matplotlib.ticker import MultipleLocator\n", "\n", "# Make a table of the fractions and the samples\n", "extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']]\n", "\n", "# When the abundance is below the set value, change the species/genus/family... to 'Other'\n", "abundance_threshold = 1\n", "# Apply abundance filter\n", "extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'\n", "\n", "# Identify the top 20 names (because we cant have >20 distinct colors)\n", "# Sort the DataFrame by 'fraction_total_reads' in descending order\n", "extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)\n", "top_20_names = extracted_df_sorted['name'].unique()[:20]\n", "# Replace names not in the top 20 with 'Other'\n", "extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'\n", "\n", "# Sum the abundance for each species and sample\n", "pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)\n", "# Remove rows with all zeros\n", "pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]\n", "\n", "# Sum the columns\n", "sums = pivot_df.sum(axis=0)\n", "# This should be 100 for each column\n", "sums = list(sums)\n", "print(sums)\n", "# Round the sums to 1 decimal places\n", "sums = [round(x, 0) for x in sums]\n", "# Check if the sums are 100\n", "assert sums == [100.0]*len(sums)\n", "\n", "# Transpose and sort \n", "pivot_df_sorted_T = pivot_df.T.sort_index()\n", "\n", "# Set a custom column and color\n", "custom_column = 'Other'\n", "custom_colors = {'Other': 'lightgrey'}\n", "# Move the custom column to the last position in the DataFrame\n", "pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]\n", "\n", "# Set a colormap\n", "cmap = plt.get_cmap('tab20')\n", "# Plot a stacked barchart with custom color\n", "ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, \n", " color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])\n", "\n", "# Angle the x labels\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')\n", "# Set the y-axis ticks from 0 to 100 in steps of 10\n", "ax.yaxis.set_major_locator(MultipleLocator(10))\n", "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x))) # optional: to remove decimal points\n", "\n", "# XY Label names \n", "ax.set_xlabel('Sample', fontsize=16)\n", "ax.set_ylabel('Fraction', fontsize=16)\n", "# Set title\n", "ax.set_title('Bracken abundances on '+tax_levels[bracken_level]+\" level with database \"+k2_database, fontsize=16)\n", "# Move legend outside the plot\n", "ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### PCA plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocessing: Standardizing the data\n", "from sklearn.preprocessing import StandardScaler\n", "from adjustText import adjust_text\n", "from sklearn.decomposition import PCA\n", "\n", "# Transposing the DataFrame\n", "df_numerical = pivot_df.T\n", "\n", "# Standardizing the data\n", "scaler = StandardScaler()\n", "df_scaled = scaler.fit_transform(df_numerical)\n", "\n", "# Applying PCA\n", "pca = PCA(n_components=2) # using 2 components for visualization purposes\n", "principal_components = pca.fit_transform(df_scaled)\n", "\n", "# Creating a DataFrame for the PCA results\n", "pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])\n", "\n", "# Adding the phylum information back for interpretation\n", "pca_df['dataset'] = list(pivot_df.T.index)\n", "\n", "# Visualization\n", "plt.figure(figsize=(8, 6))\n", "plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])\n", "\n", "# Prepare a list to store the text objects for adjust_text\n", "texts = []\n", "for i, txt in enumerate(pca_df['dataset']):\n", " texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))\n", "\n", "# Using adjust_text to prevent overlapping\n", "adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))\n", "\n", "# After adjust_text, add bounding boxes to the texts\n", "for text in texts:\n", " text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))\n", "\n", "# Adding labels and title with the variable explained variance\n", "plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')\n", "plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')\n", "plt.title('Bracken analysis\\nPCA on '+tax_levels[bracken_level]+\" level with database \"+k2_database)\n", "plt.grid(True)\n", "\n", "# Show the plot\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "bracken_plot", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 2 }