Post-processing of HUMAnN data#

This section outlines how to generate some basic statistics and plots from the output of the HUMAnN analysis pipeline used in UNLOCK.

This section is written for output from HUMAnN 3.0.

Note

Please note that this document is a work in progress.

To follow this guide, install the following dependencies (if not installed yet):

try:
    !humann --version
except:
    !pip install humann

try:
    import matplotlib
except ImportError:
    !pip install matplotlib

try:
    import pandas
except ImportError:
    !pip install pandas

try:
    import seaborn
except ImportError:
    !pip install seaborn

try:
    import numpy
except ImportError:
    !pip install numpy

try:
    import scipy
except ImportError:
    !pip install scipy

Connection settings#

from irods.session import iRODSSession
import ssl
from getpass import getpass

# iRODS authentication information
host = "data.m-unlock.nl"
port = 1247
zone = "unlock"
user = "<SRAM USERNAME>"  # replace with your SRAM username before running
password = getpass()  # prompted interactively, never stored in the notebook

# SSL settings
# Default client-side context that verifies the server certificate.
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)

# NOTE(review): neither ssl_settings nor the iRODSSession import above is
# used by the ibridges Session created in the next cell, and `password` is
# prompted again there - confirm whether this cell is a leftover from a
# python-irodsclient based workflow.
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                "irods_authentication_scheme": "native",
                'ssl_context': context}

print("Authenticating to iRODS as... " + user)
# Location of the iRODS environment file consumed by the next cell.
irods_env_dir = "~/.irods"
irods_env_file = "irods_environment.json"


from pathlib import Path
from ibridges import Session
from getpass import getpass

# Resolve the iRODS environment file, e.g. ~/.irods/irods_environment.json.
env_loc = Path(irods_env_dir) / irods_env_file
env_file = env_loc.expanduser()

# Prompt for the SRAM password without echoing it.
password = getpass()

# Session() raises on authentication failure, so reaching the print below
# means the connection was established (a truthiness check adds nothing).
session = Session(env_file, password=password)
print("Session successfully established")

Data retrieval#

This will download only the humann subfolders in your specific study

## set study and make download dir

from ibridges import IrodsPath
from pathlib import Path

# Set investigation and study
investigation = "gutbrain_axis"
study = "mind-set"

# Local download target, relative to the directory this notebook runs in.
download_path = f"./unlock_downloads/{investigation}/{study}"

# Create the directory tree if it does not exist yet (idempotent).
download_dir = Path(download_path).expanduser()
download_dir.mkdir(parents=True, exist_ok=True)
from ibridges import IrodsPath, search_data, download
from pathlib import Path
import os

# Recursively downloads all humann3 folders
folder_pattern="humann3"
# Define search path
search = f"/unlock/home/wur.{investigation}/stu_{study}/"
# NOTE(review): each search hit is both wrapped in IrodsPath below AND used
# directly for collection_exists()/download() - confirm what type
# search_data actually yields; one of the two usages may be redundant.
data = search_data(session, path=IrodsPath(session, search), path_pattern=folder_pattern)

# Set options and counters
overwrite = False  # when False, existing non-empty local folders are kept
downloaded, skipped = 0,0
unique_folders = []  # collects hits that are not collections (see else branch)

# Go through the search results and download all specified folders
for item in data:
    irods_path = IrodsPath(session, item) # Create an IrodsPath object for the item
    run = irods_path.parent.name # Extract the name of the parent folder
    local_destination = Path(download_path) / run     # Construct the local destination path
    if item.collection_exists():  # Only process directories (collections)
        if not local_destination.exists() or overwrite:
            local_destination.mkdir(parents=True, exist_ok=True)
            download(session, item, local_destination, overwrite=overwrite)
            downloaded += 1
        elif len(os.listdir(local_destination)) == 0:
            # Destination exists but is empty (e.g. an interrupted earlier
            # run): retry the download.
            download(session, item, local_destination, overwrite=overwrite)
            downloaded += 1
        else:
            skipped += 1
    elif item not in unique_folders:  # Avoid downloading already processed items
        unique_folders.append(item)

# Print download summary
print("\nDownloaded: ", downloaded)
print("Skipped: ", skipped)
print("Total unique folders processed:", len(unique_folders))

Set study input folder#

Let’s set up the specific study input folder:

import os

# Point this at the study's downloaded data before running the rest.
input_folder = "path/to/the/input/data"

# Echo the resolved path when the folder exists; warn early otherwise.
if os.path.exists(input_folder):
    print(f"Using input folder: {os.path.abspath(input_folder)}")
else:
    print(f"Error: The folder '{input_folder}' does not exist.")

# All derived tables and plots land in a dedicated output subfolder.
output_folder = f"{input_folder}/output"
os.makedirs(output_folder, exist_ok=True)

Join tables#

We will merge the gene family and pathway abundance outputs into a single table. First, we need to flatten the folder structure by copying all relevant .tsv files into the same directory (matching on the corresponding suffixes). HUMAnN produces a variety of output files; the suffixes below are just illustrative examples — pick the normalization and regrouping methods best suited to your analysis:

import shutil
import glob

def _collect_and_copy(suffix):
    """Find every *_{suffix}.tsv under input_folder (recursively) and copy it into output_folder; return the matches."""
    matches = glob.glob(os.path.join(input_folder, "**", f"*_{suffix}.tsv"), recursive=True)
    for path in matches:
        shutil.copy2(path, output_folder)
    return matches

genefamily_suffix = "genefamilies-community-relab_go"
genefamily_files = _collect_and_copy(genefamily_suffix)

pathabundance_suffix = "pathabundance-community-cpm"
pathabundance_files = _collect_and_copy(pathabundance_suffix)

print(f"Copied {len(genefamily_files)} genefamily files to {output_folder}")
print(f"Copied {len(pathabundance_files)} pathabundance files to {output_folder}")

Now, we can use the humann_join_tables script; this might take a while depending on the number of files, especially for the gene families:

# Merge all per-sample gene-family tables into one wide table;
# --file_name restricts the join to files carrying the given suffix.
!humann_join_tables --input "{output_folder}" \
--output "{output_folder}/{genefamily_suffix}_joined.tsv" \
--file_name {genefamily_suffix}
# Same join for the pathway-abundance tables.
!humann_join_tables --input "{output_folder}" \
--output "{output_folder}/{pathabundance_suffix}_joined.tsv" \
--file_name {pathabundance_suffix}

Let’s clean up the flattened folder structure:

# Remove the temporary flat copies now that the joined tables exist.
for copied in genefamily_files + pathabundance_files:
    target = os.path.join(output_folder, os.path.basename(copied))
    try:
        os.remove(target)
    except FileNotFoundError:
        # Already gone (e.g. on a re-run of this cell) - nothing to do.
        pass

Now, we can create some basic visualizations based on our merged tsv files. HUMAnN also provides a barplot function:

# Show the available humann_barplot options.
!humann_barplot -h

We can use this barplot function to create a PNG of a differentially abundant pathway in our output folder. It is prudent to add metadata to help elucidate patterns in your data; see the HUMAnN manual for more information.

# Render a stacked barplot for one focal pathway (1CMET2-PWY), samples
# sorted by total abundance, written as a PNG next to the joined table.
!humann_barplot --input "{output_folder}/{pathabundance_suffix}_joined.tsv" \
--focal-feature "1CMET2-PWY" \
--sort sum \
--output "{output_folder}/barplot_{pathabundance_suffix}_joined.png"

We can take a glance at the most abundant gene families:

import pandas as pd
import matplotlib.pyplot as plt

# Load the merged HUMAnN gene-family table. Use a forward-slash path: the
# original rf"...\..." embedded a literal backslash, which only resolves on
# Windows and is inconsistent with the forward-slash paths used above.
genefamily_file = f"{output_folder}/{genefamily_suffix}_joined.tsv"
df = pd.read_csv(genefamily_file, sep="\t")

# The first column holds the feature IDs; use it as the index.
df.set_index(df.columns[0], inplace=True)

# Total abundance per feature, summed over all sample columns.
df["Total_Abundance"] = df.sum(axis=1)

# The ten features with the highest totals.
top_features = df.nlargest(10, "Total_Abundance")

# Horizontal bar chart with the most abundant feature on top.
plt.figure(figsize=(10, 6))
plt.barh(top_features.index, top_features["Total_Abundance"], color="skyblue")
plt.xlabel("Total Abundance")
plt.ylabel("Feature")
plt.title("Top 10 Most Abundant Features")
plt.gca().invert_yaxis()  # Highest at the top
plt.show()

We can also generate some global summary statistics:

import pandas as pd
import numpy as np

# Load the joined gene-family abundance table. Use a forward-slash path: the
# original rf"...\..." embedded a literal backslash and breaks on Linux/macOS.
genefamily_file = f"{output_folder}/{genefamily_suffix}_joined.tsv"
df = pd.read_csv(genefamily_file, sep='\t', index_col=0)

# Global statistics computed over every cell of the abundance matrix.
summary_stats = pd.DataFrame({
    "mean": [df.values.mean()],
    "std": [df.values.std()],
    "median": [np.median(df.values)],
    "min": [df.values.min()],
    "max": [df.values.max()]
})
print(summary_stats)

As well as statistics on the first 10 samples:

# Per-column (per-sample) statistics; describe() gives count/mean/std/
# quartiles per sample, of which we keep five and rename the 50% quartile.
described = df.describe().T
described = described.rename(columns={'50%': 'median'})
summary_stats = described[['mean', 'std', 'median', 'min', 'max']].head(10)  # or use .sample(10) for random
print(summary_stats)