Processing Amplicon Long Read Data
Below is an example of how to generate some basic statistics and plots from long-read amplicon sequencing data.
It uses the output of the tool:
Emu: species-level taxonomic abundance for full-length 16S reads
treangenlab/emu
https://www.nature.com/articles/s41592-022-01520-4
Note: this document is a work in progress.
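Each input file is a tab-separated *_rel-abundance.tsv table produced by Emu, with one row per taxon. As a rough sketch (check your own output; the estimated counts column is only present when Emu is run with --keep-counts), the columns this notebook relies on are:

tax_id	abundance	species	genus	family	order	class	phylum	estimated counts

Two special tax_id values, unmapped and mapped_unclassified, hold the reads that could not be classified; they are used in the mapping-counts section below.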
# !pip install python-irodsclient
# !pip install matplotlib
# !pip install pandas
# !pip install scikit-learn
# !pip install openpyxl
# !pip install adjustText
Connection settings
from irods.session import iRODSSession
import ssl
from getpass import getpass
# iRODS authentication information
host = "data.m-unlock.nl"
port = 1247
zone = "unlock"
user = "<SRAM USERNAME>"  # fill in your SRAM username
password = getpass()
# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
'irods_client_server_policy': 'CS_NEG_REQUIRE',
'irods_encryption_algorithm': 'AES-256-CBC',
'irods_encryption_key_size': 32,
'irods_encryption_num_hash_rounds': 16,
'irods_encryption_salt_size': 8,
"irods_authentication_scheme": "native",
'ssl_context': context}
print("Authenticating to iRODS as... " + user)
Data retrieval
This will download the relative abundance files generated by Emu.
from irods.models import Collection, DataObject
from irods.column import Criterion
import os
# Group identifier: the GROUP name corresponds to the folder located at /unlock/home/wur.<groupname>
GROUP = "<GROUP>"
# Use "%" as a wildcard
STUDY = "<STUDY>"
print("Searching in group",GROUP,"with study",STUDY,"....\n")
# Standard authentication
with iRODSSession(host=host, port=port, user=user, password=password, zone=zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%'+GROUP+'%'+STUDY+'%')).filter( \
        Criterion('like', DataObject.name, '%_rel-abundance.tsv'))
    retrieved = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # The iRODS path is mirrored as a local path below the current directory
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: ", index, data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")
        retrieved.add("." + data_file)
print("\n\nTotal nr. of files downloaded:", len(retrieved))
Create data frame from abundance files
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Initialize an empty DataFrame
emu_df = pd.DataFrame()
# Loop over each file path in the 'retrieved' set
for file_path in retrieved:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    # Add the sample name (derived from the filename) as a column in the DataFrame
    df['sample'] = file_path.split("/")[-1].split("_rel-abundance.tsv")[0]
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    emu_df = pd.concat([emu_df, df], ignore_index=True)
# Multiply the abundance column by 100 to get percentages
emu_df['abundance'] = emu_df['abundance'] * 100
print(emu_df)
# Initiate filtered df
filtered_emu_df = pd.DataFrame()
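Before continuing, it can be useful to verify that the columns used in the rest of this notebook are actually present; the names below assume the Emu output layout sketched at the top of this page:

# Columns this notebook relies on; adjust if your Emu output differs
expected_columns = {'tax_id', 'abundance', 'estimated counts',
                    'species', 'genus', 'family', 'order', 'class', 'phylum', 'sample'}
missing = expected_columns - set(emu_df.columns)
if missing:
    print("Warning, missing expected columns:", missing)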
Clean some of the sample names if needed
# General substring removal / replacement
emu_df['sample'] = emu_df['sample'].str.replace('_native_16S', '', regex=False)
# Rename specific sample names
emu_df.loc[emu_df['sample'] == 'Water_control', 'sample'] = 'Water_control_1'
# Note: the '_native_16S' substring was already stripped above
emu_df.loc[emu_df['sample'] == 'Water_control_assay_2', 'sample'] = 'Water_control_2'
print(emu_df["sample"].unique())
Get some mapping counts
Estimated read counts are based on likelihood probabilities and therefore may not be integer values. They are calculated as the product of the estimated relative abundance and the total number of classified reads.
## Unmapped and mapped-but-unclassified read counts ##
unmapped_df = emu_df[emu_df['tax_id'].str.contains('unmapped', na=False)][['sample', 'estimated counts']]
unmapped_df = unmapped_df.rename(columns={'estimated counts': 'unmapped_count'})
mapped_unclassified_df = emu_df[emu_df['tax_id'].str.contains('mapped_unclassified', na=False)][['sample', 'estimated counts']]
mapped_unclassified_df = mapped_unclassified_df.rename(columns={'estimated counts': 'mapped_unclassified_count'})
uncounts_df = unmapped_df.merge(mapped_unclassified_df, on='sample', how='inner')
# Total classified read counts (exclude the special unmapped / mapped_unclassified rows)
classified_df = emu_df[~emu_df['tax_id'].str.contains('unmapped|mapped_unclassified', na=False)]
readcounts_df = classified_df.groupby('sample')['estimated counts'].sum().reset_index().rename(columns={'estimated counts': 'total_classified_readcount'})
# Merge the two dataframes
readcounts_df = readcounts_df.merge(uncounts_df, on='sample', how='inner')
# Calculate the percentage of classified reads
readcounts_df['percentage_classified'] = ((readcounts_df['total_classified_readcount'] /
    (readcounts_df['total_classified_readcount'] + readcounts_df['unmapped_count'] + readcounts_df['mapped_unclassified_count'])) * 100).round(2)
print(readcounts_df)
# Write to tsv
readcounts_df.to_csv('Emu_16S_mappingstats.tsv', sep='\t', index=False)
# Or to excel
readcounts_df.to_excel('Emu_16S_mappingstats.xlsx', index=False)
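As a sanity check on the note above, recomputing relative abundances from the estimated counts should closely reproduce Emu's abundance column. A sketch, reusing classified_df and readcounts_df from the cell above:

# Recompute abundance (%) from counts and compare with Emu's abundance column
check_df = classified_df.merge(readcounts_df[['sample', 'total_classified_readcount']], on='sample')
recomputed = check_df['estimated counts'] / check_df['total_classified_readcount'] * 100
# Deviations should be close to zero (small differences may come from rounding)
print("Max abundance deviation:", (recomputed - check_df['abundance']).abs().max())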
Separate positive, negative and water control samples
Adjust to match your experimental set-up (or skip when not applicable).
positive_controls = ['Positive_control_1', 'Positive_control_2']
negative_controls = ['PCR_negative_control']
water_controls = ["Water_control_1","Water_control_2"]
# Create separate dataframe for positive, negative and water controls
positive_df = emu_df[emu_df['sample'].isin(positive_controls)]
negatives_df = emu_df[emu_df['sample'].isin(negative_controls)]
water_df = emu_df[emu_df['sample'].isin(water_controls)]
# Remove these samples from the full dataframe
filtered_emu_df = emu_df[~emu_df['sample'].isin(positive_controls+negative_controls+water_controls)]
# Print remaining samples
for sample in filtered_emu_df['sample'].unique(): print(sample)
print("\nTotal nr samples:",len(filtered_emu_df['sample'].unique()))
# Filter some more
#emu_df = emu_df[emu_df['sample'].str.contains('sup', na=False)]
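Before discarding the controls, it can be worth a quick look at what was detected in the negative and water controls, for example to spot reagent contamination. A minimal sketch using the control DataFrames created above:

# Top taxa detected in the negative and water controls
controls_df = pd.concat([negatives_df, water_df])
print(controls_df.sort_values('abundance', ascending=False)[['sample', 'species', 'abundance']].head(10))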
Create a stacked bar chart per taxonomy level
import matplotlib.pyplot as plt
#import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator
#filtered_emu_df = positive_df  # uncomment to plot the positive controls instead
taxon_levels = ["species","genus","family","order","class","phylum"]
for level in taxon_levels:
    # Make a table of the abundances and the samples on the level specified above
    # Use the filtered DataFrame when available, otherwise the full one
    # (.copy() avoids pandas' SettingWithCopyWarning on the edits below)
    if filtered_emu_df.empty:
        extracted_df = emu_df[[level, 'abundance', 'sample']].copy()
    else:
        extracted_df = filtered_emu_df[[level, 'abundance', 'sample']].copy()
    # Emu leaves a taxon level blank when it is undefined; give these an 'unassigned_taxon_level' value
    extracted_df.fillna('unassigned_taxon_level', inplace=True)
    # When the abundance is below the set value, change the species/genus/family... to 'Other'
    abundance_threshold = 0.1
    # Apply abundance filter
    extracted_df.loc[extracted_df['abundance'] < abundance_threshold, level] = 'Other'
    # Identify the top 20 names (the 'tab20' colormap used below has only 20 distinct colors)
    # Sort the DataFrame by 'abundance' in descending order
    extracted_df_sorted = extracted_df.sort_values(by='abundance', ascending=False)
    top_20_names = extracted_df_sorted[level].unique()[:20]
    # Replace names not in the top 20 with 'Other'
    if len(extracted_df_sorted[level].unique()) > 20:
        extracted_df.loc[~extracted_df[level].isin(top_20_names), level] = 'Other'
    # Sum the abundance for each taxon and sample
    pivot_df = extracted_df.pivot_table(index=level, columns='sample', values='abundance', aggfunc='sum', fill_value=0)
    # Remove rows with all zeros
    pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]
    # Sum the columns; each should add up to roughly 100
    sums = [round(x, 0) for x in pivot_df.sum(axis=0)]
    # Check if the sums are 100
    #assert sums == [100.0]*len(sums)
    # Transpose and sort
    pivot_df_sorted_T = pivot_df.T.sort_index()
    # Set a custom column and color for the 'Other' group if present
    custom_colors = {}
    if len(extracted_df_sorted[level].unique()) > 20:
        custom_column = 'Other'
        custom_colors = {'Other': 'lightgrey'}
        # Move the custom column to the last position in the DataFrame
        pivot_df_sorted_T = pivot_df_sorted_T[[col for col in pivot_df_sorted_T.columns if col != custom_column] + [custom_column]]
    # Set a colormap
    cmap = plt.get_cmap('tab20')
    # Plot a stacked bar chart with custom colors
    ax = pivot_df_sorted_T.plot(kind='bar', stacked=True, figsize=(20, 7), width=0.7,
                                color=[custom_colors.get(col, cmap(i)) for i, col in enumerate(pivot_df_sorted_T.columns)])
    # Angle the x labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Set the y-axis ticks from 0 to 100 in steps of 10
    ax.yaxis.set_major_locator(MultipleLocator(10))
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: int(x)))  # optional: remove decimal points
    # XY label names
    ax.set_xlabel('Sample', fontsize=16)
    ax.set_ylabel('Abundance (%)', fontsize=16)
    # Set title
    ax.set_title('Emu 16S classification and abundances on ' + level, fontsize=16)
    # Move legend outside the plot
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=level)
    plt.tight_layout()
    #plt.savefig('16S_Emu_barplot_'+level+'.png')
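    # Optional sketch: persist the plotted per-level table alongside the figure,
    # mirroring the commented-out savefig above (the filename pattern is only a suggestion)
    # pivot_df_sorted_T.to_csv('Emu_16S_abundance_' + level + '.tsv', sep='\t')
    # Display the figure for this taxon level
    plt.show()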
PCA plot per taxon level
# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA
# Use the transposed per-sample table from the last taxon level plotted above
df_numerical = pivot_df_sorted_T
# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)
# Applying PCA
pca = PCA(n_components=2) # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)
# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
# Add the sample names back for labelling (taken from the same table that was decomposed)
pca_df['dataset'] = list(df_numerical.index)
# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])
# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))
# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))
# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))
# Adding labels and title with the variable explained variance
plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')
plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')
plt.title('PCA of ' + level + ' data')  # 'level' still holds the last taxon level from the loop above
plt.grid(True)
# Show the plot
plt.show()
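To see which taxa drive the separation between samples, you can inspect the PCA loadings. A minimal sketch, reusing the fitted pca and df_numerical from above:

# Loadings: the contribution of each taxon to the two principal components
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns, columns=['PC1', 'PC2'])
# Show the ten taxa with the largest absolute contribution to PC1
print(loadings.reindex(loadings['PC1'].abs().sort_values(ascending=False).index).head(10))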