Processing Amplicon Long Read Data#

Below is an example of how to generate basic statistics and plots from long-read amplicon sequencing data.

It uses the output of the tool:
Emu: species-level taxonomic abundance for full-length 16S reads

treangenlab/emu
https://www.nature.com/articles/s41592-022-01520-4

Note

This document is a work in progress.

# !pip install python-irodsclient
# !pip install matplotlib
# !pip install pandas
# !pip install scikit-learn
# !pip install openpyxl
# !pip install adjustText

Connection settings#

from irods.session import iRODSSession
import ssl
from getpass import getpass

# iRODS authentication information
host = "data.m-unlock.nl"
port = 1247
zone = "unlock"
user = "<SRAM USERNAME>"
password = getpass()

# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)

ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                "irods_authentication_scheme": "native",
                'ssl_context': context}

print("Authenticating to iRODS as... " + user)

Data retrieval#

This will download the per-sample relative abundance files generated by Emu.

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Group identifier; the GROUP name corresponds to the folder located at /unlock/home/wur.<groupname>
GROUP = "<GROUP>"
# Use "%" as wildcard
STUDY = "<STUDY>"

print("Searching in group",GROUP,"with study",STUDY,"....\n")
# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%'+GROUP+'%'+STUDY+'%')).filter( \
        Criterion('like', DataObject.name, '%_rel-abundance.tsv'))
    
    retrieved = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # Original path in irods is local path from current location
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: ", index, data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")
        retrieved.add("." + data_file)
print("\n\nTotal nr. files downloaded:",len(retrieved))

Create a data frame from the abundance files#

import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Initialize an empty DataFrame
emu_df = pd.DataFrame()

# Loop over each file path in the 'retrieved' set
for file_path in retrieved:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    
    # Add the filename as a column in the DataFrame
    df['sample'] = file_path.split("/")[-1].split("_rel-abundance.tsv")[0]
    
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    emu_df = pd.concat([emu_df, df], ignore_index=True)
    
# Multiply the abundance column by 100 to get percentages
emu_df['abundance'] = emu_df['abundance'] * 100
print(emu_df)
# Initiate filtered df
filtered_emu_df = pd.DataFrame()

Clean some of the sample names if needed#

# General substring removal / replacement
emu_df['sample'] = emu_df['sample'].str.replace('_native_16S', '', regex=False)

# Rename specific sample names
emu_df.loc[emu_df['sample'] == 'Water_control', 'sample'] = 'Water_control_1'
emu_df.loc[emu_df['sample'] == 'Water_control_assay_2_native_16S', 'sample'] = 'Water_control_2'

print(emu_df["sample"].unique())

Get some mapping counts#

Estimated read counts are based on likelihood probabilities and therefore may not be integer values: they are calculated as the product of the estimated relative abundance and the total number of classified reads. For example, a taxon at 3.14% relative abundance in a sample with 10,000 classified reads gets an estimated count of 0.0314 × 10,000 = 314 reads, and fractional values such as 313.7 are equally possible.

## Unmapped and mapped unclassified read counts ##
unmapped_df = emu_df[emu_df['tax_id'].str.contains('unmapped', na=False)][['sample', 'estimated counts']]
unmapped_df = unmapped_df.rename(columns={'estimated counts': 'unmapped_count'})
mapped_unclassified_df = emu_df[emu_df['tax_id'].str.contains('mapped_unclassified', na=False)][['sample', 'estimated counts']]
mapped_unclassified_df = mapped_unclassified_df.rename(columns={'estimated counts': 'mapped_unclassified_count'})

uncounts_df = unmapped_df.merge(mapped_unclassified_df, on='sample', how='inner')

# Total classified read counts (exclude the unmapped and mapped-unclassified rows from the sum)
classified_df = emu_df[~emu_df['tax_id'].str.contains('unmapped|mapped_unclassified', na=False)]
readcounts_df = classified_df.groupby('sample')['estimated counts'].sum().reset_index().rename(columns={'estimated counts': 'total_classified_readcount'})
# merge the two dataframes
readcounts_df = readcounts_df.merge(uncounts_df, on='sample', how='inner')

# Calculate the percentage of reads that was classified
readcounts_df['percentage_classified'] = ((readcounts_df['total_classified_readcount'] /
                     (readcounts_df['total_classified_readcount'] + readcounts_df['unmapped_count'] + readcounts_df['mapped_unclassified_count'])) * 100).round(2)
print(readcounts_df)

# Write to tsv
readcounts_df.to_csv('Emu_16S_mappingstats.tsv', sep='\t', index=False)
# Or to excel
readcounts_df.to_excel('Emu_16S_mappingstats.xlsx', index=False)
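
As a quick consistency check on the note above, each classified row's estimated count should roughly equal its relative abundance times the sample's total classified reads. A minimal sketch, assuming the 'abundance' column still holds the percentages set earlier:

# Sanity check: estimated counts ≈ (abundance / 100) * total classified reads
for sample, grp in classified_df.groupby('sample'):
    total = grp['estimated counts'].sum()
    max_diff = (grp['estimated counts'] - grp['abundance'] / 100 * total).abs().max()
    print(sample, "max deviation (reads):", round(max_diff, 4))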

Separate positive, negative and water control samples#

Adjust the lists below to your experimental set-up (or skip this step when not applicable).

positive_controls = ['Positive_control_1', 'Positive_control_2']
negative_controls = ['PCR_negative_control']
water_controls = ["Water_control_1","Water_control_2"]

# Create separate dataframe for positive, negative and water controls
positive_df = emu_df[emu_df['sample'].isin(positive_controls)]
negatives_df = emu_df[emu_df['sample'].isin(negative_controls)] 
water_df = emu_df[emu_df['sample'].isin(water_controls)] 

# Remove these samples from the full dataframe
filtered_emu_df = emu_df[~emu_df['sample'].isin(positive_controls+negative_controls+water_controls)]

# Print remaining samples
for sample in filtered_emu_df['sample'].unique(): print(sample)
print("\nTotal nr samples:",len(filtered_emu_df['sample'].unique()))


# Optional: filter further, e.g. keep only samples whose name contains 'sup'
#filtered_emu_df = filtered_emu_df[filtered_emu_df['sample'].str.contains('sup', na=False)]

Create stacked barchart per taxonomy level#

import matplotlib.pyplot as plt
#import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator

# Uncomment to plot only the positive controls
#filtered_emu_df = positive_df

taxon_levels = ["species","genus","family","order","class","phylum"]

for level in taxon_levels:
    # Make a table of the abundances and the samples on the level specified above
    # Check if data has been filtered or not
    if filtered_emu_df.empty:
        extracted_df = emu_df[[level, 'abundance', 'sample']].copy()
    else:
        extracted_df = filtered_emu_df[[level, 'abundance', 'sample']].copy()

    # Emu leaves a taxon level blank when no name is assigned at that rank; label these 'unassigned_taxon_level'
    # (.copy() above makes extracted_df independent, so the assignments below do not raise SettingWithCopyWarning)
    extracted_df.fillna('unassigned_taxon_level', inplace=True)

    # When the abundance is below the set value, change the species/genus/family... to 'Other'
    abundance_threshold = 0.1
    # Apply abundance filter
    extracted_df.loc[extracted_df['abundance'] < abundance_threshold, level] = 'Other'

    # Identify the top 20 names (the tab20 colormap used below provides 20 distinct colors)
    # Sort the DataFrame by 'abundance' in descending order
    extracted_df_sorted = extracted_df.sort_values(by='abundance', ascending=False)
    top_20_names = extracted_df_sorted[level].unique()[:20]

    # Replace names not in the top 20 with 'Other'
    if len(extracted_df_sorted[level].unique()) > 20:
        extracted_df.loc[~extracted_df[level].isin(top_20_names), level] = 'Other'

    # Sum the abundance for each species and sample
    pivot_df = extracted_df.pivot_table(index=level, columns='sample', values='abundance', aggfunc='sum', fill_value=0)
    # Remove rows with all zeros
    pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]

    # Sum the columns
    sums = pivot_df.sum(axis=0)
    # This should be 100 for each column
    sums = list(sums)
    #print(sums)
    # Round the sums to whole numbers
    sums = [round(x, 0) for x in sums]
    # Check if the sums are 100
    #assert sums == [100.0]*len(sums)

    # Transpose and sort 
    pivot_df_sorted_T = pivot_df.T.sort_index()

    # Set a custom column and color for the 'Other' group if present
    # (define custom_colors up front so the plot call below never hits a NameError)
    custom_colors = {}
    if len(extracted_df_sorted[level].unique()) > 20:
        custom_column = 'Other'
        custom_colors = {'Other': 'lightgrey'}
        # Move the custom column to the last position in the DataFrame
        pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]

    # Set a colormap
    cmap = plt.get_cmap('tab20')
    # Plot a stacked barchart with custom color
    ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7, 
                        color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])

    # Angle the x labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45,ha='right')
    # Set the y-axis ticks from 0 to 100 in steps of 10
    ax.yaxis.set_major_locator(MultipleLocator(10))
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: to remove decimal points

    # XY Label names 
    ax.set_xlabel('Sample', fontsize=16)
    ax.set_ylabel('Abundance', fontsize=16)
    # Set title
    ax.set_title('Emu 16S classification and abundances on '+level, fontsize=16)
    # Move legend outside the plot
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=level)


    plt.tight_layout()
    # plt.figure(figsize=(5, 5))
    
    #plt.savefig('16S_Emu_barplot_'+level+'.png')

PCA plot per taxon level#

The PCA below runs on the pivot table left over from the last iteration of the loop above (the final entry in taxon_levels); reorder that list or rerun the loop to analyse a different level.

# Preprocessing: standardizing the data
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from adjustText import adjust_text

# Use the sorted, transposed pivot table (samples as rows, taxa as columns)
df_numerical = pivot_df_sorted_T

# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)

# Applying PCA
pca = PCA(n_components=2)  # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])

# Add the sample names back for interpretation (same row order as df_numerical)
pca_df['dataset'] = list(df_numerical.index)

# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])

# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))

# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))

# Adding labels and title with the variable explained variance
plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')
plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')
plt.title('PCA of '+level+' Data')
plt.grid(True)

# Show the plot
plt.show()
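
To see which taxa drive the separation along each axis, the loadings of the fitted PCA can be inspected. A minimal sketch using the pca object and the taxa columns of df_numerical from above:

# PCA loadings: the contribution of each taxon to the two components
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns,
                        columns=['Principal Component 1', 'Principal Component 2'])
# Show the taxa with the largest absolute contribution to the first component
print(loadings.reindex(loadings['Principal Component 1'].abs()
                       .sort_values(ascending=False).index).head(10))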