Processing Amplicon Long Read Data

Below is an example of how to analyse amplicon data obtained from long-read sequencing.

Note

This document is a work in progress.

# Install the plotting dependencies if they are missing:
# !pip install matplotlib
# !pip install seaborn

Connection settings

from irods.session import iRODSSession
import ssl
from irods.meta import iRODSMeta
import os, sys

# iRODS authentication information
host = "unlock-icat.irods.surfsara.nl"
port = 1247
zone = "unlock"
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')

# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)

ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                "irods_authentication_scheme": "pam",
                'ssl_context': context}

print("Authenticating to iRODS as... " + user)

Data retrieval

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Group identifier
GROUP = "wur.srt_fermentation"

# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%'+GROUP+'%')).filter( \
        Criterion('like', DataObject.name, '%_rel-abundance.tsv'))
    
    retrieved = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # The iRODS path is mirrored locally, relative to the current directory
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: ", index, data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")
        retrieved.add("." + data_file)

Data processing

# List the result files downloaded from iRODS
print("Files in unlock directory:")
for path in sorted(retrieved):
    print(path)

# Concatenate the files containing tabular information into a single DataFrame for further processing
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Initialize an empty DataFrame
merged_df = pd.DataFrame()

# Loop over each file path in the 'retrieved' set
for file_path in retrieved:
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep='\t', index_col=None, header=0)
    
    # Add the filename as a column in the DataFrame
    df['filename'] = file_path.split("/")[-1].split("_amplicon")[0]
    
    # Reset the index of the DataFrame to avoid duplicate indices
    # 'drop=True' prevents the old indices from being added as a new column
    # 'inplace=True' makes the change in the existing DataFrame
    df.reset_index(drop=True, inplace=True)
    
    # Concatenate the current DataFrame with the merged DataFrame
    # 'ignore_index=True' resets the index in the final DataFrame
    merged_df = pd.concat([merged_df, df], ignore_index=True)
    
print(merged_df)
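
Before converting to percentages it is worth a quick sanity check. The snippet below is a sketch that assumes the pipeline reports relative abundances as fractions, so each file's total should be close to 1.0.

# Sanity check (sketch): per-file abundance totals, expected to be close to 1.0
print(merged_df.groupby('filename')['abundance'].sum())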

# Multiply the abundance column by 100 to convert fractions to percentages
merged_df['abundance'] = merged_df['abundance'] * 100
# Bar plot of the phylum abundance for each filename
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,6))
sns.barplot(x='phylum', y='abundance', hue='filename', data=merged_df, errorbar=None)
plt.title('Phylum Abundance for Each File')
plt.xlabel('Phylum')
plt.ylabel('Abundance')
plt.xticks(rotation=90)
plt.show()
# Make a table of the species, the abundance and the filename
extracted_df = merged_df[['species', 'abundance', 'filename']]

# Sum the abundance for each species and filename
pivot_df = extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)

# Sum the columns
sums = pivot_df.sum(axis=0)
# This should be 100 for each column
sums = list(sums)
# Round the sums to 2 decimal places
sums = [round(x, 2) for x in sums]
# Check if the sums are 100
assert sums == [100.0]*len(sums)

# Plot the transposed pivot table
pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))
# Make a table of the species, the abundance and the filename
# Use .copy() so the relabelling below does not trigger a SettingWithCopyWarning
extracted_df = merged_df[['species', 'abundance', 'filename']].copy()

# When the abundance is below 5%, change the species to 'Other'
extracted_df.loc[extracted_df['abundance'] < 5, 'species'] = 'Other'
# Sum the abundance for each species and filename
pivot_df = extracted_df.pivot_table(index='species', columns='filename', values='abundance', aggfunc='sum', fill_value=0)
# Remove rows with all zeros
pivot_df = pivot_df.loc[(pivot_df != 0).any(axis=1)]

# Sum the columns
sums = pivot_df.sum(axis=0)
# This should be 100 for each column
sums = list(sums)
# Round the sums to 2 decimal places
sums = [round(x, 2) for x in sums]
# Check if the sums are 100
assert sums == [100.0]*len(sums)

# Plot the transposed pivot table
pivot_df.T.plot(kind='bar', stacked=True, figsize=(10,7))
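
As an optional cosmetic tweak, not part of the original workflow, the stacking order can be set by overall mean abundance so that the dominant species sit at the bottom of each bar:

# Optional (sketch): order species by their mean abundance across samples
order = pivot_df.mean(axis=1).sort_values(ascending=False).index
pivot_df.loc[order].T.plot(kind='bar', stacked=True, figsize=(10, 7))
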
# Preprocessing: Standardizing the data
from sklearn.preprocessing import StandardScaler
from adjustText import adjust_text
from sklearn.decomposition import PCA

# Transposing the DataFrame
df_numerical = pivot_df.T

# Standardizing the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical)

# Applying PCA
pca = PCA(n_components=2)  # using 2 components for visualization purposes
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])

# Adding the sample (filename) labels back for interpretation
pca_df['dataset'] = list(df_numerical.index)

# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['Principal Component 1'], pca_df['Principal Component 2'])

# Prepare a list to store the text objects for adjust_text
texts = []
for i, txt in enumerate(pca_df['dataset']):
    texts.append(plt.text(pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i], txt))

# Using adjust_text to prevent overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# After adjust_text, add bounding boxes to the texts
for text in texts:
    text.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black', boxstyle='round,pad=0.5'))

# Adding labels and title with the variable explained variance
plt.xlabel('Principal Component 1 (' + str(round(pca.explained_variance_ratio_[0]*100, 2)) + '%)')
plt.ylabel('Principal Component 2 (' + str(round(pca.explained_variance_ratio_[1]*100, 2)) + '%)')
plt.title('PCA of Species Abundance Data')
plt.grid(True)

# Show the plot
plt.show()
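
To interpret the components, the loadings of the fitted pca object can be inspected. The sketch below is illustrative only; it uses the column order of df_numerical to label the loadings and lists the species with the largest absolute contribution to the first principal component.

# Sketch: species with the largest absolute loading on PC1
loadings = pd.DataFrame(pca.components_.T, index=df_numerical.columns, columns=['PC1', 'PC2'])
print(loadings.reindex(loadings['PC1'].abs().sort_values(ascending=False).index).head(10))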