{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Processing Amplicon Long Read Data\n", "\n", "Below is an example on how to analyse the amplicon data obtained from long read sequencing...\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip install matplotlib\n", "# %pip install seaborn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Connection settings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.session import iRODSSession\n", "import ssl\n", "from irods.meta import iRODSMeta\n", "import os, sys\n", "\n", "# iRODS authentication information; credentials are read from environment variables\n", "host = \"unlock-icat.irods.surfsara.nl\"\n", "port = 1247\n", "zone = \"unlock\"\n", "user = os.getenv('irodsUserName')\n", "password = os.getenv('irodsPassword')\n", "\n", "# Fail fast with a clear message when the credentials are not configured,\n", "# instead of a TypeError on the string concatenation below\n", "assert user and password, \"Set the irodsUserName and irodsPassword environment variables\"\n", "\n", "# SSL settings (cafile/capath/cadata default to None, so they need not be passed)\n", "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH)\n", "\n", "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", " 'irods_encryption_algorithm': 'AES-256-CBC',\n", " 'irods_encryption_key_size': 32,\n", " 'irods_encryption_num_hash_rounds': 16,\n", " 'irods_encryption_salt_size': 8,\n", " \"irods_authentication_scheme\": \"pam\",\n", " 'ssl_context': context}\n", "\n", "print(\"Authenticating to iRODS as... 
\" + user)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data retrieval\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.models import Collection, DataObject\n", "from irods.column import Criterion\n", "import os\n", "\n", "# Group identifier\n", "GROUP = \"wur.srt_fermentation\"\n", "\n", "# Standard authentication\n", "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n", " # Set the filters to select the files needed for downloading\n", " results = session.query(Collection, DataObject).filter( \\\n", " Criterion('like', Collection.name, '%'+GROUP+'%')).filter( \\\n", " Criterion('like', DataObject.name, '%_rel-abundance.tsv'))\n", " \n", " retrieved = set()\n", " for index, r in enumerate(results):\n", " # Download the result files\n", " data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n", " # Original path in irods is local path from current location\n", " if not os.path.exists(\".\" + data_file):\n", " os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n", " print(\"Downloading: \", index, data_file, end=\"\\r\")\n", " session.data_objects.get(data_file, \".\" + data_file)\n", " else:\n", " print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")\n", " retrieved.add(\".\" + data_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Concatenate the files containing tabular information into a single table for further processing\n", "import pandas as pd\n", "import numpy as np\n", "\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.width', 1000)\n", "\n", "# Initialize an 
empty list to collect the per-file DataFrames\n", "frames = []\n", "\n", "# Loop over each file path in the 'retrieved' set\n", "for file_path in retrieved:\n", " # Read the tab-separated abundance table into a DataFrame\n", " df = pd.read_csv(file_path, sep='\\t', index_col=None, header=0)\n", " \n", " # Add the sample name (filename up to '_amplicon') as a column\n", " df['filename'] = file_path.split(\"/\")[-1].split(\"_amplicon\")[0]\n", " \n", " frames.append(df)\n", "\n", "# Concatenate once after the loop; growing a DataFrame with concat inside\n", "# the loop copies all previously merged rows on every iteration (quadratic).\n", "# 'ignore_index=True' gives the merged table a fresh 0..n-1 index\n", "merged_df = pd.concat(frames, ignore_index=True)\n", "\n", "print(merged_df)\n", "\n", "# Multiply the abundance column by 100 to get percentages\n", "merged_df['abundance'] = merged_df['abundance'] * 100\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Barplot the phylum abundance for each filename\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "plt.figure(figsize=(10,6))\n", "sns.barplot(x='phylum', y='abundance', hue='filename', data=merged_df, errorbar=None)\n", "plt.title('Phylum Abundance for Each File')\n", "plt.xlabel('phylum')\n", "plt.ylabel('Abundance')\n", "plt.xticks(rotation=90)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Make a table of the species, the abundance and the filename\n", "extracted_df = merged_df[['species', 'abundance', 'filename']]\n", "\n", "# Sum the abundance for each species and filename\n", "pivot_df = 
extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)\n", "\n", "# Each sample (column) holds relative abundances, so it should total 100%\n", "sums = pivot_df.sum(axis=0)\n", "# Compare with a small tolerance instead of exact float equality\n", "assert all(abs(total - 100.0) < 0.01 for total in sums), list(sums)\n", "\n", "# Plot the transposed pivot table\n", "pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Make a table of the species, the abundance and the filename.\n", "# Use .copy() so the 'Other' relabeling below cannot touch merged_df\n", "# and does not trigger a SettingWithCopyWarning\n", "extracted_df = merged_df[['species', 'abundance', 'filename']].copy()\n", "\n", "# When the abundance is below 5, change the species to 'Other'\n", "extracted_df.loc[extracted_df['abundance'] < 5, 'species'] = 'Other'\n", "# Sum the abundance for each species and filename\n", "pivot_df = extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)\n", "# Remove rows with all zeros\n", "pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]\n", "\n", "# Each sample (column) should still total 100% after the regrouping\n", "sums = pivot_df.sum(axis=0)\n", "# Compare with a small tolerance instead of exact float equality\n", "assert all(abs(total - 100.0) < 0.01 for total in sums), list(sums)\n", "\n", "# Plot the transposed pivot table\n", "pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocessing: Standardizing the data\n", "from sklearn.preprocessing import StandardScaler\n", "from adjustText import adjust_text\n", "from sklearn.decomposition import PCA\n", "\n", "# Transposing the DataFrame so rows = samples, columns = species\n", "df_numerical = pivot_df.T\n", "\n", "# Standardizing the data\n", "scaler = 
StandardScaler()\n", "df_scaled = scaler.fit_transform(df_numerical)\n", "\n", "# Applying PCA\n", "pca = PCA(n_components=2) # using 2 components for visualization purposes\n", "principal_components = pca.fit_transform(df_scaled)\n", "\n", "# Creating a DataFrame for the PCA results\n", "pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])\n", "\n", "# Adding the sample (dataset) labels back for interpretation\n", "pca_df['dataset'] = list(pivot_df.T.index)\n", "\n", "# Visualization\n", "plt.figure(figsize=(8, 6))\n", "plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])\n", "\n", "# Prepare a list to store the text objects for adjust_text\n", "texts = []\n", "for i, txt in enumerate(pca_df['dataset']):\n", " # Use .loc for scalar lookups; chained indexing (df[col][i]) is discouraged\n", " texts.append(plt.text(pca_df.loc[i, 'Principal Component 1'], pca_df.loc[i, 'Principal Component 2'], txt))\n", "\n", "# Using adjust_text to prevent overlapping\n", "adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))\n", "\n", "# After adjust_text, add bounding boxes to the texts\n", "for text in texts:\n", " text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))\n", "\n", "# Adding labels and title with the variable explained variance\n", "plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')\n", "plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')\n", "plt.title('PCA of Species Abundance Data')\n", "plt.grid(True)\n", "\n", "# Show the plot\n", "plt.show()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 2 }