Processing Amplicon Long Read Data
Below is an example of how to generate some basic statistics and plots from long-read amplicon sequencing data.
It uses the output of the tool:
Emu: species-level taxonomic abundance for full-length 16S reads
treangenlab/emu
https://www.nature.com/articles/s41592-022-01520-4
Note: this document is a work in progress.
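Each input file is a tab-separated *_rel-abundance.tsv table produced by Emu, with one row per taxon. As a rough sketch (check your own output; the estimated counts column is only present when Emu is run with --keep-counts), the columns this notebook relies on are:

tax_id	abundance	species	genus	family	order	class	phylum	estimated counts

Two special tax_id values, unmapped and mapped_unclassified, hold the reads that could not be classified; they are used in the mapping-counts section below.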
# !pip install python-irodsclient
# !pip install matplotlib
# !pip install pandas
# !pip install scikit-learn
# !pip install openpyxl
# !pip install adjustText
Connection settings
from irods.session import iRODSSession
import ssl
from getpass import getpass
# iRODS authentication information
host = "data.m-unlock.nl"
port = 1247
zone = "unlock"
user = "<SRAM USERNAME>"  # fill in your SRAM username
password = getpass()
# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
'irods_client_server_policy': 'CS_NEG_REQUIRE',
'irods_encryption_algorithm': 'AES-256-CBC',
'irods_encryption_key_size': 32,
'irods_encryption_num_hash_rounds': 16,
'irods_encryption_salt_size': 8,
"irods_authentication_scheme": "native",
'ssl_context': context}
print("Authenticating to iRODS as... " + user)
Data retrieval
This will download the relative abundance files generated by Emu.
from irods.models import Collection, DataObject
from irods.column import Criterion
import os
# Group identifier: the GROUP name corresponds to the folder located at /unlock/home/wur.<groupname>
GROUP = "<GROUP>"
# Use "%" as a wildcard
STUDY = "<STUDY>"
print("Searching in group",GROUP,"with study",STUDY,"....\n")
# Standard authentication
with iRODSSession(host=host, port=port, user=user, password=password, zone=zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%'+GROUP+'%'+STUDY+'%')).filter( \
        Criterion('like', DataObject.name, '%_rel-abundance.tsv'))
    retrieved = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # The iRODS path is mirrored as a local path below the current directory
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: ", index, data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")
        retrieved.add("." + data_file)
print("\n\nTotal nr. of files downloaded:", len(retrieved))
Create data frame from abundance files
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Initialize an empty DataFrame
emu_df = pd.DataFrame()
# Loop over each file path in the 'retrieved' set
for file_path in retrieved:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    # Add the sample name (derived from the filename) as a column in the DataFrame
    df['sample'] = file_path.split("/")[-1].split("_rel-abundance.tsv")[0]
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    emu_df = pd.concat([emu_df, df], ignore_index=True)
# Multiply the abundance column by 100 to get percentages
emu_df['abundance'] = emu_df['abundance'] * 100
print(emu_df)
# Initiate filtered df
filtered_emu_df = pd.DataFrame()
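Before continuing, it can be useful to verify that the columns used in the rest of this notebook are actually present; the names below assume the Emu output layout sketched at the top of this page:

# Columns this notebook relies on; adjust if your Emu output differs
expected_columns = {'tax_id', 'abundance', 'estimated counts',
                    'species', 'genus', 'family', 'order', 'class', 'phylum', 'sample'}
missing = expected_columns - set(emu_df.columns)
if missing:
    print("Warning, missing expected columns:", missing)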
Clean some of the sample names if needed
# General substring removal / replacement
emu_df['sample'] = emu_df['sample'].str.replace('_native_16S', '', regex=False)
# Rename specific sample names
emu_df.loc[emu_df['sample'] == 'Water_control', 'sample'] = 'Water_control_1'
# Note: the '_native_16S' substring was already stripped above
emu_df.loc[emu_df['sample'] == 'Water_control_assay_2', 'sample'] = 'Water_control_2'
print(emu_df["sample"].unique())
Get some mapping counts
Estimated read counts are based on likelihood probabilities and therefore may not be integer values. They are calculated as the product of the estimated relative abundance and the total number of classified reads.
## Unmapped and mapped-but-unclassified read counts ##
unmapped_df = emu_df[emu_df['tax_id'].str.contains('unmapped', na=False)][['sample', 'estimated counts']]
unmapped_df = unmapped_df.rename(columns={'estimated counts': 'unmapped_count'})
mapped_unclassified_df = emu_df[emu_df['tax_id'].str.contains('mapped_unclassified', na=False)][['sample', 'estimated counts']]
mapped_unclassified_df = mapped_unclassified_df.rename(columns={'estimated counts': 'mapped_unclassified_count'})
uncounts_df = unmapped_df.merge(mapped_unclassified_df, on='sample', how='inner')
# Total classified read counts (exclude the special unmapped / mapped_unclassified rows)
classified_df = emu_df[~emu_df['tax_id'].str.contains('unmapped|mapped_unclassified', na=False)]
readcounts_df = classified_df.groupby('sample')['estimated counts'].sum().reset_index().rename(columns={'estimated counts': 'total_classified_readcount'})
# Merge the two dataframes
readcounts_df = readcounts_df.merge(uncounts_df, on='sample', how='inner')
# Calculate the percentage of classified reads
readcounts_df['percentage_classified'] = ((readcounts_df['total_classified_readcount'] /
    (readcounts_df['total_classified_readcount'] + readcounts_df['unmapped_count'] + readcounts_df['mapped_unclassified_count'])) * 100).round(2)
print(readcounts_df)
# Write to tsv
readcounts_df.to_csv('Emu_16S_mappingstats.tsv', sep='\t', index=False)
# Or to excel
readcounts_df.to_excel('Emu_16S_mappingstats.xlsx', index=False)
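As a sanity check on the note above, recomputing relative abundances from the estimated counts should closely reproduce Emu's abundance column. A sketch, reusing classified_df and readcounts_df from the cell above:

# Recompute abundance (%) from counts and compare with Emu's abundance column
check_df = classified_df.merge(readcounts_df[['sample', 'total_classified_readcount']], on='sample')
recomputed = check_df['estimated counts'] / check_df['total_classified_readcount'] * 100
# Deviations should be close to zero (small differences may come from rounding)
print("Max abundance deviation:", (recomputed - check_df['abundance']).abs().max())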
Separate positive, negative and water control samples
Adjust to match your experimental set-up (or skip when not applicable).
positive_controls = ['Positive_control_1', 'Positive_control_2']
negative_controls = ['PCR_negative_control']
water_controls = ["Water_control_1","Water_control_2"]
# Create separate dataframe for positive, negative and water controls
positive_df = emu_df[emu_df['sample'].isin(positive_controls)]
negatives_df = emu_df[emu_df['sample'].isin(negative_controls)]
water_df = emu_df[emu_df['sample'].isin(water_controls)]
# Remove these samples from the full dataframe
filtered_emu_df = emu_df[~emu_df['sample'].isin(positive_controls+negative_controls+water_controls)]
# Print remaining samples
for sample in filtered_emu_df['sample'].unique(): print(sample)
print("\nTotal nr samples:",len(filtered_emu_df['sample'].unique()))
# Filter some more
#emu_df = emu_df[emu_df['sample'].str.contains('sup', na=False)]
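Before discarding the controls, it can be worth a quick look at what was detected in the negative and water controls, for example to spot reagent contamination. A minimal sketch using the control DataFrames created above:

# Top taxa detected in the negative and water controls
controls_df = pd.concat([negatives_df, water_df])
print(controls_df.sort_values('abundance', ascending=False)[['sample', 'species', 'abundance']].head(10))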
Create a stacked bar chart per taxonomy level
import matplotlib.pyplot as plt
#import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator
#filtered_emu_df = positive_df  # uncomment to plot the positive controls instead
taxon_levels = ["species","genus","family","order","class","phylum"]
for level in taxon_levels:
    # Make a table of the abundances and the samples on the level specified above
    # Use the filtered DataFrame when available, otherwise the full one
    # (.copy() avoids pandas' SettingWithCopyWarning on the edits below)
    if filtered_emu_df.empty:
        extracted_df = emu_df[[level, 'abundance', 'sample']].copy()
    else:
        extracted_df = filtered_emu_df[[level, 'abundance', 'sample']].copy()
    # Emu leaves a taxon level blank when it is undefined; give these an 'unassigned_taxon_level' value
    extracted_df.fillna('unassigned_taxon_level', inplace=True)
    # When the abundance is below the set value, change the species/genus/family... to 'Other'
    abundance_threshold = 0.1
    # Apply abundance filter
    extracted_df.loc[extracted_df['abundance'] < abundance_threshold, level] = 'Other'
    # Identify the top 20 names (the 'tab20' colormap used below has only 20 distinct colors)
    # Sort the DataFrame by 'abundance' in descending order
    extracted_df_sorted = extracted_df.sort_values(by='abundance', ascending=False)
    top_20_names = extracted_df_sorted[level].unique()[:20]
    # Replace names not in the top 20 with 'Other'
    if len(extracted_df_sorted[level].unique()) > 20:
        extracted_df.loc[~extracted_df[level].isin(top_20_names), level] = 'Other'
    # Sum the abundance for each taxon and sample
    pivot_df = extracted_df.pivot_table(index=level, columns='sample', values='abundance', aggfunc='sum', fill_value=0)
    # Remove rows with all zeros
    pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]
    # Sum the columns; each should add up to roughly 100
    sums = [round(x, 0) for x in pivot_df.sum(axis=0)]
    # Check if the sums are 100
    #assert sums == [100.0]*len(sums)
    # Transpose and sort
    pivot_df_sorted_T = pivot_df.T.sort_index()
    # Set a custom column and color for the 'Other' group if present
    custom_colors = {}
    if len(extracted_df_sorted[level].unique()) > 20:
        custom_column = 'Other'
        custom_colors = {'Other': 'lightgrey'}
        # Move the custom column to the last position in the DataFrame
        pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]
    # Set a colormap
    cmap = plt.get_cmap('tab20')
    # Plot a stacked bar chart with custom colors
    ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7,
                                color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])
    # Angle the x labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Set the y-axis ticks from 0 to 100 in steps of 10
    ax.yaxis.set_major_locator(MultipleLocator(10))
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: remove decimal points
    # XY label names
    ax.set_xlabel('Sample', fontsize=16)
    ax.set_ylabel('Abundance (%)', fontsize=16)
    # Set title
    ax.set_title('Emu 16S classification and abundances on ' + level, fontsize=16)
    # Move legend outside the plot
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=level)
    plt.tight_layout()
    #plt.savefig('16S_Emu_barplot_'+level+'.png')
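    # Optional sketch: persist the plotted per-level table alongside the figure,
    # mirroring the commented-out savefig above (the filename pattern is only a suggestion)
    # pivot_df_sorted_T.to_csv('Emu_16S_abundance_' + level + '.tsv', sep='\t')
    # Display the figure for this taxon level
    plt.show()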
PCA plot per taxon level
# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA
# Use the transposed per-sample table from the last taxon level plotted above
df_numerical = pivot_df_sorted_T
# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)
# Applying PCA
pca = PCA(n_components=2) # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)
# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
# Add the sample names back for labelling (taken from the same table that was decomposed)
pca_df['dataset'] = list(df_numerical.index)
# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])
# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))
# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))
# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))
# Adding labels and title with the variable explained variance
plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')
plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')
plt.title('PCA of ' + level + ' data')  # 'level' still holds the last taxon level from the loop above
plt.grid(True)
# Show the plot
plt.show()
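To see which taxa drive the separation between samples, you can inspect the PCA loadings. A minimal sketch, reusing the fitted pca and df_numerical from above:

# Loadings: the contribution of each taxon to the two principal components
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns, columns=['PC1', 'PC2'])
# Show the ten taxa with the largest absolute contribution to PC1
print(loadings.reindex(loadings['PC1'].abs().sort_values(ascending=False).index).head(10))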