Bracken taxonomic abundances#

Below is an example of how to analyse Bracken data from multiple samples.

Note

This document is a work in progress.

%pip install ibridges pandas matplotlib adjustText scikit-learn tqdm --quiet

Data retrieval using iBridges#

This page assumes you have already installed iBridges and created an iRODS environment JSON file. Check the iBridges Python client documentation for the latest information on how to install, authenticate, and retrieve data with iBridges.
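As a quick sanity check, you can verify that the environment file exists before running the rest of the notebook. The path below is the standard iRODS location and is an assumption; adjust it if your file lives elsewhere.

from pathlib import Path

# Standard iRODS location for the environment file (adjust if yours differs)
env_file = Path.home() / ".irods" / "irods_environment.json"
if not env_file.exists():
    raise FileNotFoundError(f"No iRODS environment file found at {env_file}")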

Set the investigation and study variables below to the names of your Investigation and Study.

The following code downloads the Bracken files from the iRODS instance.

from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download
import os
import concurrent.futures
from tqdm.notebook import tqdm

data_files = set()

def download_item(session, item):
    local_path = os.path.join("data", item.name)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)  # Ensure directories exist
    download(session, item, local_path, overwrite=True)
    data_files.add(local_path)

def parallel_download(session, results, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the results with tqdm for progress tracking
        list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))

# Example usage:
session = interactive_auth()

# Set investigation and study variables. Investigation name corresponds to the folder located at /unlock/home/wur.<name> and study to a subfolder /stu_<name>
investigation = "SET_INVESTIGATION" 
study = "SET_STUDY"

# Restrict the search to a specific Kraken2 database and/or Bracken level ('%' is a wildcard that matches any value)
k2_database = "%"
bracken_level = "%"

print("Searching in group",investigation,"with study",study,"....\n")
# Searching for files in the iRODS collection
results = search_data(session, path_pattern=f"%wur.{investigation}/stu_{study}%/%_filtered_"+k2_database+"_bracken_"+bracken_level+".txt")
# Start the download process
parallel_download(session, results)
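After the download completes, data_files holds the local paths of everything that was retrieved; a quick check that the search actually matched something:

# Verify that files were downloaded
print(f"Downloaded {len(data_files)} files to the data/ folder")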

Set filters to analyse a subset of the files#

# Function to get the sample name from the filename
def get_sample_name(filename, run_id, k2_database, bracken_level):
    sample_name = filename.removeprefix(run_id + "_")
    sample_name = sample_name.removesuffix("_illumina_filtered_" + k2_database + "_bracken_" + bracken_level + ".txt")
    return sample_name

tax_levels = {'R1': 'Kingdom','D': 'Domain','P': 'Phylum','C': 'Class','O': 'Order','F': 'Family','G': 'Genus','S': 'Species','S1': 'Strain'}

STUDY = "stu_bmock12"
run_id = "ReadTaxonomy"

k2_database = "k2_pluspfp_20240112"
#k2_database = "kraken2_db_uhgg_v2.0.2"
bracken_level = "S"

# "filtered" or "unfiltered"
filtered = "filtered"

filtered_paths = list(filter(lambda path: STUDY in path and "_"+filtered+"_" in path and k2_database in path and bracken_level+".txt" in path, data_files))

print("sample name\tfilename\tfull_location")
for file_path in filtered_paths:
    filename = file_path.split("/")[-1]
    sample_name = get_sample_name(filename, run_id, k2_database, bracken_level)
    print(sample_name + "\t" + filename + "\t" + file_path)

Concatenate the tabular files into a single DataFrame for further processing#

import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Initialize an empty DataFrame
merged_df = pd.DataFrame()

# Loop over each file path in the filtered list
for file_path in filtered_paths:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    
    # Derive the sample name from the filename
    filename = file_path.split("/")[-1]
    sample_name = get_sample_name(filename, run_id, k2_database, bracken_level)
    # Add the filename as a column in the DataFrame
    df['sample'] = sample_name
    
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    merged_df = pd.concat([merged_df, df], ignore_index=True)
    
# Multiply the abundance column by 100 to get percentages
merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100
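Before plotting, it can help to inspect the merged table. The sketch below looks at the columns the rest of the notebook relies on: the Bracken report columns name and fraction_total_reads, plus the sample column added above.

# Quick sanity check of the merged table
print(merged_df.shape)
print(merged_df[['name', 'fraction_total_reads', 'sample']].head())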

Creating the plot with an abundance threshold#

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

# Extract the taxon names, abundance fractions, and sample names
# (.copy() avoids SettingWithCopyWarning when modifying the slice below)
extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']].copy()

# When the abundance is below the set value, change the species/genus/family... to 'Other'
abundance_threshold = 1
# Apply abundance filter
extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'

# Identify the top 20 names (the tab20 colormap used below provides 20 distinct colors)
# Sort the DataFrame by 'fraction_total_reads' in descending order
extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)
top_20_names = extracted_df_sorted['name'].unique()[:20]
# Replace names not in the top 20 with 'Other'
extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'

# Sum the abundance for each species and sample
pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)
# Remove rows with all zeros
pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]

# Sum each column; every sample should total 100%
sums = list(pivot_df.sum(axis=0))
print(sums)
# Round the sums to the nearest integer
sums = [round(x, 0) for x in sums]
# Check that every sample sums to 100%
assert sums == [100.0] * len(sums)
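Because Bracken fractions are rounded in the report files, the per-sample totals can deviate slightly from 100%. If the strict assertion above fails by a small margin, a tolerance-based check is a more forgiving alternative (a minimal sketch using numpy):

import numpy as np

# Allow each per-sample total to deviate from 100% by a small margin
assert np.allclose(pivot_df.sum(axis=0), 100.0, atol=0.5)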

# Transpose and sort 
pivot_df_sorted_T = pivot_df.T.sort_index()

# Set a custom column and color
custom_column = 'Other'
custom_colors = {'Other': 'lightgrey'}
# Move the custom column to the last position in the DataFrame
pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]

# Set a colormap
cmap = plt.get_cmap('tab20')
# Plot a stacked barchart with custom color
ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, 
                     color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])

# Angle the x labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')
# Set the y-axis ticks from 0 to 100 in steps of 10
ax.yaxis.set_major_locator(MultipleLocator(10))
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: to remove decimal points

# Axis labels
ax.set_xlabel('Sample', fontsize=16)
ax.set_ylabel('Relative abundance (%)', fontsize=16)
# Set title
ax.set_title('Bracken abundances on '+tax_levels[bracken_level]+" level with database "+k2_database, fontsize=16)
# Move legend outside the plot
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])
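If you want to keep a copy of the figure alongside the notebook, you can save it before it is displayed; the filename here is just an example.

# Save the stacked barchart; bbox_inches='tight' keeps the outside legend in frame
plt.savefig(f"bracken_{bracken_level}_{k2_database}.png", dpi=300, bbox_inches="tight")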

PCA plot#

# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA

# Transpose so that samples are rows (observations) and taxa are columns (features)
df_numerical = pivot_df.T

# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)

# Applying PCA
pca = PCA(n_components=2)  # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])

# Add the sample names back for labeling the points
pca_df['dataset'] = list(df_numerical.index)

# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])

# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))

# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))

# Label the axes with the explained variance of each principal component
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')
plt.title('Bracken analysis\nPCA on '+tax_levels[bracken_level]+" level with database "+k2_database)
plt.grid(True)

# Show the plot
plt.show()
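To see which taxa drive the separation between the samples, you can inspect the PCA loadings. This is a sketch under the assumption that pca and df_numerical from the cell above are still in scope.

# Loadings: contribution of each taxon to the first two principal components
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns, columns=['PC1', 'PC2'])
# Show the taxa with the largest absolute contribution to PC1
print(loadings.reindex(loadings['PC1'].abs().sort_values(ascending=False).index).head(10))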