Bracken taxonomic abundances#

Below is an example of how to analyse Bracken data from multiple samples.

Note

This document is a work in progress.

%pip install ibridges pandas matplotlib adjustText scikit-learn tqdm --quiet

Data retrieval using iBridges#

This page assumes you have already installed iBridges and created an iRODS environment JSON file. Check the iBridges Python client documentation for the latest information on how to install, authenticate, and retrieve data with iBridges.
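As a quick sanity check, you can verify that the environment file exists before running the rest of the notebook. The path below is the standard iRODS location and is an assumption; adjust it if your file lives elsewhere.

from pathlib import Path

# Standard iRODS location for the environment file (adjust if yours differs)
env_file = Path.home() / ".irods" / "irods_environment.json"
if not env_file.exists():
    raise FileNotFoundError(f"No iRODS environment file found at {env_file}")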

Set the investigation and study variables below to the names of your Investigation and Study.

The following code downloads the Bracken files from the iRODS instance.

from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download
import os
import concurrent.futures
from tqdm.notebook import tqdm

data_files = set()

def download_item(session, item):
    local_path = os.path.join("data", item.name)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)  # Ensure directories exist
    download(session, item, local_path, overwrite=True)
    data_files.add(local_path)

def parallel_download(session, results, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the results with tqdm for progress tracking
        list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))

# Example usage:
session = interactive_auth()

# Set investigation and study variables. Investigation name corresponds to the folder located at /unlock/home/wur.<name> and study to a subfolder /stu_<name>
investigation = "SET_INVESTIGATION" 
study = "SET_STUDY"

# Restrict the search to a specific Kraken2 database and/or Bracken level ('%' is a wildcard that matches any value)
k2_database = "%"
bracken_level = "%"

print("Searching in group",investigation,"with study",study,"....\n")
# Searching for files in the iRODS collection
results = search_data(session, path_pattern=f"%wur.{investigation}/stu_{study}%/%_filtered_"+k2_database+"_bracken_"+bracken_level+".txt")
# Start the download process
parallel_download(session, results)
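After the download completes, data_files holds the local paths of everything that was retrieved; a quick check that the search actually matched something:

# Verify that files were downloaded
print(f"Downloaded {len(data_files)} files to the data/ folder")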

Set filters to analyse a subset of the files#

# Function to get the sample name from the filename
def get_sample_name(filename, run_id, k2_database, bracken_level):
    sample_name = filename.removeprefix(run_id + "_")
    sample_name = sample_name.removesuffix("_illumina_filtered_" + k2_database + "_bracken_" + bracken_level + ".txt")
    return sample_name

tax_levels = {'R1': 'Kingdom','D': 'Domain','P': 'Phylum','C': 'Class','O': 'Order','F': 'Family','G': 'Genus','S': 'Species','S1': 'Strain'}

STUDY = "stu_bmock12"
run_id = "ReadTaxonomy"

k2_database = "k2_pluspfp_20240112"
#k2_database = "kraken2_db_uhgg_v2.0.2"
bracken_level = "S"

# "filtered" or "unfiltered"
filtered = "filtered"

filtered_paths = list(filter(lambda path: STUDY in path and "_"+filtered+"_" in path and k2_database in path and bracken_level+".txt" in path, data_files))

print("sample name\tfilename\tfull_location")
for file_path in filtered_paths:
    filename = file_path.split("/")[-1]
    sample_name = get_sample_name(filename, run_id, k2_database, bracken_level)
    print(sample_name + "\t" + filename + "\t" + file_path)

Concatenate the tabular files into a single DataFrame for further processing#

import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Initialize an empty DataFrame
merged_df = pd.DataFrame()

# Loop over each file path in the filtered list
for file_path in filtered_paths:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    
    # Derive the sample name from the filename
    filename = file_path.split("/")[-1]
    sample_name = get_sample_name(filename, run_id, k2_database, bracken_level)
    # Add the filename as a column in the DataFrame
    df['sample'] = sample_name
    
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    merged_df = pd.concat([merged_df, df], ignore_index=True)
    
# Multiply the abundance column by 100 to get percentages
merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100
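Before plotting, it can help to inspect the merged table. The sketch below looks at the columns the rest of the notebook relies on: the Bracken report columns name and fraction_total_reads, plus the sample column added above.

# Quick sanity check of the merged table
print(merged_df.shape)
print(merged_df[['name', 'fraction_total_reads', 'sample']].head())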

Creating the plot with an abundance threshold#

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

# Extract the taxon names, abundance fractions, and sample names
# (.copy() avoids SettingWithCopyWarning when modifying the slice below)
extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']].copy()

# When the abundance is below the set value, change the species/genus/family... to 'Other'
abundance_threshold = 1
# Apply abundance filter
extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'

# Identify the top 20 names (the tab20 colormap used below provides 20 distinct colors)
# Sort the DataFrame by 'fraction_total_reads' in descending order
extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)
top_20_names = extracted_df_sorted['name'].unique()[:20]
# Replace names not in the top 20 with 'Other'
extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'

# Sum the abundance for each species and sample
pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)
# Remove rows with all zeros
pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]

# Sum each column; every sample should total 100%
sums = list(pivot_df.sum(axis=0))
print(sums)
# Round the sums to the nearest integer
sums = [round(x, 0) for x in sums]
# Check that every sample sums to 100%
assert sums == [100.0] * len(sums)
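Because Bracken fractions are rounded in the report files, the per-sample totals can deviate slightly from 100%. If the strict assertion above fails by a small margin, a tolerance-based check is a more forgiving alternative (a minimal sketch using numpy):

import numpy as np

# Allow each per-sample total to deviate from 100% by a small margin
assert np.allclose(pivot_df.sum(axis=0), 100.0, atol=0.5)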

# Transpose and sort 
pivot_df_sorted_T = pivot_df.T.sort_index()

# Set a custom column and color
custom_column = 'Other'
custom_colors = {'Other': 'lightgrey'}
# Move the custom column to the last position in the DataFrame
pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]

# Set a colormap
cmap = plt.get_cmap('tab20')
# Plot a stacked barchart with custom color
ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, 
                     color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])

# Angle the x labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')
# Set the y-axis ticks from 0 to 100 in steps of 10
ax.yaxis.set_major_locator(MultipleLocator(10))
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: to remove decimal points

# Axis labels
ax.set_xlabel('Sample', fontsize=16)
ax.set_ylabel('Relative abundance (%)', fontsize=16)
# Set title
ax.set_title('Bracken abundances on '+tax_levels[bracken_level]+" level with database "+k2_database, fontsize=16)
# Move legend outside the plot
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])
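If you want to keep a copy of the figure alongside the notebook, you can save it before it is displayed; the filename here is just an example.

# Save the stacked barchart; bbox_inches='tight' keeps the outside legend in frame
plt.savefig(f"bracken_{bracken_level}_{k2_database}.png", dpi=300, bbox_inches="tight")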

PCA plot#

# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA

# Transpose so that samples are rows (observations) and taxa are columns (features)
df_numerical = pivot_df.T

# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)

# Applying PCA
pca = PCA(n_components=2)  # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])

# Add the sample names back for labeling the points
pca_df['dataset'] = list(df_numerical.index)

# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])

# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))

# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))

# Label the axes with the explained variance of each principal component
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')
plt.title('Bracken analysis\nPCA on '+tax_levels[bracken_level]+" level with database "+k2_database)
plt.grid(True)

# Show the plot
plt.show()
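To see which taxa drive the separation between the samples, you can inspect the PCA loadings. This is a sketch under the assumption that pca and df_numerical from the cell above are still in scope.

# Loadings: contribution of each taxon to the first two principal components
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns, columns=['PC1', 'PC2'])
# Show the taxa with the largest absolute contribution to PC1
print(loadings.reindex(loadings['PC1'].abs().sort_values(ascending=False).index).head(10))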