{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bracken taxonomic abundances\n", "\n", "Below is an example on how to analyse bracken data of multiple samples.\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install ibridges matplotlib adjustText scikit-learn --quiet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data retrieval using iBridges\n", "\n", "This page assumes you have already installed iBridges and created an irods json file. Check the [iBridges python client page](../storage/ibridges_python.ipynb) for the latest information on how to install, authenticate, and retrieve data from iBridges.\n", "\n", "Set the INVESTIGATION and STUDY variables to the corresponding names of your Investigation and Study.\n", "\n", "*This will download the Bracken files from the iRODS instance*\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from ibridges.interactive import interactive_auth\n", "from ibridges import search_data\n", "from ibridges import download\n", "import os\n", "import concurrent.futures\n", "from tqdm.notebook import tqdm\n", "\n", "data_files = set()\n", "\n", "def download_item(session, item):\n", " local_path = os.path.join(\"data\", item.name)\n", " os.makedirs(os.path.dirname(local_path), exist_ok=True) # Ensure directories exist\n", " download(session, item, local_path, overwrite=True)\n", " data_files.add(local_path)\n", "\n", "def parallel_download(session, results, max_workers=10):\n", " with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n", " # Wrap the results with tqdm for progress tracking\n", " list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc=\"Downloading\"))\n", "\n", "# Example usage:\n", "session = interactive_auth()\n", "\n", "# Set investigation and study variables. Investigation name corresponds to the folder located at /unlock/home/wur. and study to a subfolder /stu_\n", "investigation = \"SET_INVESTIGATION\" \n", "study = \"SET_STUDY\"\n", "\n", "# Set Kraken2 database or bracken level to filter for those specific versions\n", "k2_database = \"%\"\n", "bracken_level = \"%\"\n", "\n", "print(\"Searching in group\",investigation,\"with study\",study,\"....\\n\")\n", "# Searching for files in the iRODS collection\n", "results = search_data(session, path_pattern=f\"%wur.{investigation}/stu_{study}%/%_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n", "# Start the download process\n", "parallel_download(session, results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set filters to analyse a subset of the files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to get the sample name from the filename\n", "def get_sample_name(filename,run_id, k2_database,bracken_level):\n", " sample_name = filename.removeprefix(run_id+\"_\") #filename.lstrip(runID+\"_\")\n", " sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n", " return(sample_name)\n", "\n", "tax_levels = {'R1': 'Kingdom','D': 'Domain','P': 'Phylum','C': 'Class','O': 'Order','F': 'Family','G': 'Genus','S': 'Species','S1': 'Strain'}\n", "\n", "STUDY = \"stu_bmock12\"\n", "run_id = \"ReadTaxonomy\"\n", "\n", "k2_database = \"k2_pluspfp_20240112\"\n", "#k2_database = \"kraken2_db_uhgg_v2.0.2\"\n", "bracken_level = \"S\"\n", "\n", "# \"filtered\" or \"unfiltered\"\n", "filtered = \"filtered\"\n", "\n", "filtered_paths = list(filter(lambda path: STUDY in path and \"_\"+filtered+\"_\" in path and k2_database in path and bracken_level+\".txt\" in path, data_files))\n", "\n", "print(\"sample name\\tfilename\\tfull_location\")\n", "for file_path in filtered_paths:\n", " filename = file_path.split(\"/\")[-1]\n", " sample_name = get_sample_name(filename,run_id, k2_database,bracken_level)\n", " sample_name = sample_name.removesuffix(\"_illumina_filtered_\"+k2_database+\"_bracken_\"+bracken_level+\".txt\")\n", " print(sample_name+\"\\t\"+filename+\"\\t\"+file_path)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concatenate the files containing tabular information into a single file for further processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import glob\n", "\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 1000)\n", "\n", "# Initialize an empty DataFrame\n", "merged_df = pd.DataFrame()\n", "\n", "# Loop over each file path in the filtered list\n", "for file_path in filtered_paths:\n", " # Read the file into a DataFrame\n", " df = pd.read_csv(file_path, sep='\\t', index_col=None, header=0)\n", " \n", " # Get clean filename to get the sample name\n", " filename = file_path.split(\"/\")[-1]\n", " sample_name = get_sample_name(filename,run_id,k2_database,bracken_level)\n", " # Add the filename as a column in the DataFrame\n", " df['sample'] = sample_name\n", " \n", " # Reset the index of the DataFrame to avoid duplicate indices\n", " # 'drop=True' prevents the old indices from being added as a new column\n", " # 'inplace=True' makes the change in the existing DataFrame\n", " df.reset_index(drop=True, inplace=True)\n", " \n", " # Concatenate the current DataFrame with the merged DataFrame\n", " # 'ignore_index=True' resets the index in the final DataFrame\n", " merged_df = pd.concat([merged_df, df], ignore_index=True)\n", " \n", "# Multiple the abundance column by 100 to get percentages\n", "merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating the plot with an abundance threshold" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib.cm as cm\n", "from matplotlib.ticker import MultipleLocator\n", "\n", "# Make a table of the fractions and the samples\n", "extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']]\n", "\n", "# When the abundance is below the set value, change the species/genus/family... to 'Other'\n", "abundance_threshold = 1\n", "# Apply abundance filter\n", "extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'\n", "\n", "# Identify the top 20 names (because we cant have >20 distinct colors)\n", "# Sort the DataFrame by 'fraction_total_reads' in descending order\n", "extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)\n", "top_20_names = extracted_df_sorted['name'].unique()[:20]\n", "# Replace names not in the top 20 with 'Other'\n", "extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'\n", "\n", "# Sum the abundance for each species and sample\n", "pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)\n", "# Remove rows with all zeros\n", "pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]\n", "\n", "# Sum the columns\n", "sums = pivot_df.sum(axis=0)\n", "# This should be 100 for each column\n", "sums = list(sums)\n", "print(sums)\n", "# Round the sums to 1 decimal places\n", "sums = [round(x, 0) for x in sums]\n", "# Check if the sums are 100\n", "assert sums == [100.0]*len(sums)\n", "\n", "# Transpose and sort \n", "pivot_df_sorted_T = pivot_df.T.sort_index()\n", "\n", "# Set a custom column and color\n", "custom_column = 'Other'\n", "custom_colors = {'Other': 'lightgrey'}\n", "# Move the custom column to the last position in the DataFrame\n", "pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]\n", "\n", "# Set a colormap\n", "cmap = plt.get_cmap('tab20')\n", "# Plot a stacked barchart with custom color\n", "ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, \n", " color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])\n", "\n", "# Angle the x labels\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')\n", "# Set the y-axis ticks from 0 to 100 in steps of 10\n", "ax.yaxis.set_major_locator(MultipleLocator(10))\n", "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x))) # optional: to remove decimal points\n", "\n", "# XY Label names \n", "ax.set_xlabel('Sample', fontsize=16)\n", "ax.set_ylabel('Fraction', fontsize=16)\n", "# Set title\n", "ax.set_title('Bracken abundances on '+tax_levels[bracken_level]+\" level with database \"+k2_database, fontsize=16)\n", "# Move legend outside the plot\n", "ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## PCA plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocessing: Standardizing the data\n", "from sklearn.preprocessing import StandardScaler\n", "from adjustText import adjust_text\n", "from sklearn.decomposition import PCA\n", "\n", "# Transposing the DataFrame\n", "df_numerical = pivot_df.T\n", "\n", "# Standardizing the data\n", "scaler = StandardScaler()\n", "df_scaled = scaler.fit_transform(df_numerical)\n", "\n", "# Applying PCA\n", "pca = PCA(n_components=2) # using 2 components for visualization purposes\n", "principal_components = pca.fit_transform(df_scaled)\n", "\n", "# Creating a DataFrame for the PCA results\n", "pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])\n", "\n", "# Adding the phylum information back for interpretation\n", "pca_df['dataset'] = list(pivot_df.T.index)\n", "\n", "# Visualization\n", "plt.figure(figsize=(8, 6))\n", "plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])\n", "\n", "# Prepare a list to store the text objects for adjust_text\n", "texts = []\n", "for i, txt in enumerate(pca_df['dataset']):\n", " texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))\n", "\n", "# Using adjust_text to prevent overlapping\n", "adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))\n", "\n", "# After adjust_text, add bounding boxes to the texts\n", "for text in texts:\n", " text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))\n", "\n", "# Adding labels and title with the variable explained variance\n", "plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')\n", "plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')\n", "plt.title('Bracken analysis\\nPCA on '+tax_levels[bracken_level]+\" level with database \"+k2_database)\n", "plt.grid(True)\n", "\n", "# Show the plot\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Testing_nbs", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 }