Bracken taxonomic abundances#
Below is an example of how to analyse bracken data from multiple samples.
Note
Please note that this document is a work in progress.
# !pip install python-irodsclient
# !pip install matplotlib
# !pip install adjustText
# !pip install scikit-learn
Connection settings#
# iRODS connection configuration: credentials, host details and SSL/PAM settings.
import os, sys
import ssl
from getpass import getpass

from irods.meta import iRODSMeta
from irods.session import iRODSSession

# iRODS authentication information
host = "unlock-icat.irods.surfsara.nl"
port = 1247
zone = "unlock"
user = "your SRAM username"
password = getpass()

# Default SSL context for server authentication (system CA store).
context = ssl.create_default_context(
    purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None
)

# Client/server negotiation, encryption and PAM authentication settings
# passed to iRODSSession in the download step below.
ssl_settings = dict(
    irods_client_server_negotiation='request_server_negotiation',
    irods_client_server_policy='CS_NEG_REQUIRE',
    irods_encryption_algorithm='AES-256-CBC',
    irods_encryption_key_size=32,
    irods_encryption_num_hash_rounds=16,
    irods_encryption_salt_size=8,
    irods_authentication_scheme="pam",
    ssl_context=context,
)

print(f"Authenticating to iRODS as... {user}")
Download data files#
# Query iRODS for bracken result files matching the filters and download
# any that are not already present locally.
import os

from irods.column import Criterion
from irods.models import Collection, DataObject

# Which datafiles do you want to retrieve
# Denote them as they are in iRODS use "%" for all
GROUP = "wur.fdp"
STUDY = "%"
run_id = "ReadTaxonomy"
k2_database = "%"
bracken_level = "%"

## Download files
file_filter = f"_filtered_{k2_database}_bracken_{bracken_level}.txt"

# Standard authentication
with iRODSSession(host=host, port=port, user=user, password=password, zone=zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject) \
        .filter(Criterion('like', Collection.name, '%' + GROUP + '/' + STUDY + '%')) \
        .filter(Criterion('like', DataObject.name, '%_' + file_filter))
    retrieved = set()
    for idx, row in enumerate(results):
        # The remote iRODS path doubles as the local path, rooted at "."
        remote_path = row[Collection.name] + "/" + row[DataObject.name]
        local_path = "." + remote_path
        if os.path.exists(local_path):
            print("Already received: " + remote_path + " " * 10, end="\r")
        else:
            os.makedirs("." + row[Collection.name], exist_ok=True)
            print("Downloading: ", idx, remote_path, end="\r")
            session.data_objects.get(remote_path, local_path)
        # Track every matching file, downloaded now or previously
        retrieved.add(local_path)
Set filters to analyse a subset of the files#
# Function to get the sample name from the filename
def get_sample_name(filename, run_id, k2_database, bracken_level):
    """Derive the sample name from a bracken result filename.

    Strips the leading ``"<run_id>_"`` prefix and the trailing
    ``"_illumina_filtered_<k2_database>_bracken_<bracken_level>.txt"`` suffix.
    Uses str.removeprefix/removesuffix (Python 3.9+), which remove one exact
    leading/trailing match — unlike str.lstrip, which strips a character set
    and would mangle sample names starting with those characters.
    """
    sample_name = filename.removeprefix(run_id + "_")
    sample_name = sample_name.removesuffix(
        "_illumina_filtered_" + k2_database + "_bracken_" + bracken_level + ".txt"
    )
    return sample_name
# Mapping from bracken level codes to human-readable taxonomic rank names.
tax_levels = {'R1': 'Kingdom', 'D': 'Domain', 'P': 'Phylum', 'C': 'Class', 'O': 'Order',
              'F': 'Family', 'G': 'Genus', 'S': 'Species', 'S1': 'Strain'}

# Subset selection: study, workflow run, kraken2 database and bracken level.
STUDY = "stu_bmock12"
run_id = "ReadTaxonomy"
k2_database = "k2_pluspfp_20240112"
#k2_database = "kraken2_db_uhgg_v2.0.2"
bracken_level = "S"
# "filtered" or "unfiltered"
filtered = "filtered"

# Keep only the downloaded files that match all of the filters above.
filtered_paths = list(filter(lambda path: STUDY in path and "_" + filtered + "_" in path
                             and k2_database in path and bracken_level + ".txt" in path, retrieved))

print("sample name\tfilename\tfull_location")
for file_path in filtered_paths:
    filename = file_path.split("/")[-1]
    # get_sample_name already strips both the run prefix and the full bracken
    # suffix, so no additional removesuffix call is needed here.
    sample_name = get_sample_name(filename, run_id, k2_database, bracken_level)
    print(sample_name + "\t" + filename + "\t" + file_path)
Concatenate the files containing tabular information into a single file for further processing#
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Collect one DataFrame per sample file and concatenate once at the end.
# (Calling pd.concat inside the loop would copy the accumulated frame on
# every iteration, which is quadratic in the number of files.)
frames = []
for file_path in filtered_paths:
    # Read the tab-separated bracken report into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    # Derive the sample name from the filename and attach it as a column
    filename = file_path.split("/")[-1]
    df['sample'] = get_sample_name(filename, run_id, k2_database, bracken_level)
    frames.append(df)

# 'ignore_index=True' gives the merged frame a fresh 0..n-1 index,
# avoiding duplicate indices from the individual files.
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Multiply the abundance column by 100 to get percentages.
# Guard against an empty selection (no files matched the filters),
# which would otherwise raise a KeyError on the missing column.
if not merged_df.empty:
    merged_df['fraction_total_reads'] = merged_df['fraction_total_reads'] * 100
Creating the plot with an abundance threshold#
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator

# Table of the abundance fractions per taxon name and sample.
# .copy() makes this an independent frame: without it the .loc writes below
# target a view of merged_df and trigger pandas' SettingWithCopyWarning.
extracted_df = merged_df[['name', 'fraction_total_reads', 'sample']].copy()

# When the abundance is below the set value, change the species/genus/family... to 'Other'
abundance_threshold = 1
# Apply abundance filter
extracted_df.loc[extracted_df['fraction_total_reads'] < abundance_threshold, 'name'] = 'Other'

# Identify the top 20 names (because we cant have >20 distinct colors):
# sorting by abundance first makes .unique() yield names in descending order.
extracted_df_sorted = extracted_df.sort_values(by='fraction_total_reads', ascending=False)
top_20_names = extracted_df_sorted['name'].unique()[:20]
# Replace names not in the top 20 with 'Other'
extracted_df.loc[~extracted_df['name'].isin(top_20_names), 'name'] = 'Other'

# Sum the abundance for each taxon name and sample
pivot_df = extracted_df.pivot_table(index='name', columns='sample', values='fraction_total_reads', aggfunc='sum', fill_value=0)
# Remove rows with all zeros
pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]

# Sanity check: each sample's fractions should add up to 100 percent.
sums = list(pivot_df.sum(axis=0))
print(sums)
# Round to whole percents to absorb floating point noise before comparing
sums = [round(x, 0) for x in sums]
assert sums == [100.0] * len(sums)

# Transpose (samples become rows) and sort samples alphabetically
pivot_df_sorted_T = pivot_df.T.sort_index()

# 'Other' gets a fixed grey color and is moved to the last (topmost) stack
# position; guard in case no taxon was collapsed into 'Other' at all.
custom_column = 'Other'
custom_colors = {'Other': 'lightgrey'}
if custom_column in pivot_df_sorted_T.columns:
    pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]

# Qualitative colormap with 20 distinct colors
cmap = plt.get_cmap('tab20')
# Plot a stacked barchart; named taxa get tab20 colors, 'Other' stays grey
ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7,
                            color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])

# Angle the x labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
# Set the y-axis ticks from 0 to 100 in steps of 10
ax.yaxis.set_major_locator(MultipleLocator(10))
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # drop decimal points from tick labels
# XY Label names
ax.set_xlabel('Sample', fontsize=16)
ax.set_ylabel('Fraction', fontsize=16)
# Set title
ax.set_title('Bracken abundances on ' + tax_levels[bracken_level] + " level with database " + k2_database, fontsize=16)
# Move legend outside the plot
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=tax_levels[bracken_level])
PCA plot#
# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA

# Transpose so samples are rows and taxa are columns
df_numerical = pivot_df.T

# Standardize each taxon to zero mean / unit variance so that highly
# abundant taxa do not dominate the principal components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)

# Project onto the first two principal components for a 2D visualization
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
# Attach the sample (dataset) names for labelling the points
pca_df['dataset'] = list(df_numerical.index)

# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])

# Place one text label per sample; positional .iloc avoids chained indexing
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'].iloc[i], pca_df['Principal Component 2'].iloc[i], txt))

# Using adjust_text to prevent overlapping labels
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))
# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))

# Axis labels carry the explained variance of each component
plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0] * 100, 2)) + '%)')
plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1] * 100, 2)) + '%)')
plt.title('Bracken analysis\nPCA on ' + tax_levels[bracken_level] + " level with database " + k2_database)
plt.grid(True)
# Show the plot
plt.show()