Amplicon retrieval#

This document describes how to retrieve multiple processed amplicon datasets, one per sample, and combine the downloaded files into a single object for downstream analysis.

Data retrieval using iBridges#

This page assumes you have already installed iBridges and created an iRODS environment JSON file. Check the iBridges Python client page for the latest information on how to install, authenticate, and retrieve data with iBridges.
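If you still need to create the environment file, it typically lives at ~/.irods/irods_environment.json and looks roughly like the sketch below; the host, port, user name, zone and authentication scheme are placeholders that depend on your own iRODS instance.

{
    "irods_host": "irods.example.org",
    "irods_port": 1247,
    "irods_user_name": "your_username",
    "irods_zone_name": "exampleZone",
    "irods_authentication_scheme": "pam_password"
}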

Amplicon data retrieval#

Retrieve the ASV, taxonomy, sequence and metadata files and transform them into a single BIOM file.

%pip install ibridges ipywidgets pandas biom-format --quiet
Note: you may need to restart the kernel to use updated packages.

Authentication and download of ASV / SEQ / MET / TAX files

# Import the iBridges helpers for authentication, search and download
from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download

import concurrent.futures
from tqdm.notebook import tqdm

data_files = set()

def download_item(session, item):
    # Download a single data object into the local data/ directory and record its path
    download(session, item, "data/", overwrite=True)
    data_files.add("data/" + item.name)

def parallel_download(session, results, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the results with tqdm for progress tracking
        list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))

# Example usage:
session = interactive_auth()

# Searching for files in the iRODS collection
results = search_data(session, path_pattern="%project/%benchmark%/%_asv.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_seq.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_met.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_tax.tsv")
# Start the download process
parallel_download(session, results)
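Optionally, you can verify that all four file types were found before merging them; a small sketch that reuses the name attribute of the search results:

from collections import Counter

# Count the retrieved files per suffix (asv.tsv, seq.tsv, met.tsv, tax.tsv)
suffix_counts = Counter(item.name.split("_")[-1] for item in results)
print(suffix_counts)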

Combining the ASV, SEQ, MET and TAX files

destinations = {
    "seq.tsv" : open("seq.tsv", "w"), 
    "asv.tsv" : open("asv.tsv", "w"),
    "tax.tsv" : open("tax.tsv", "w"),
    "met.tsv" : open("met.tsv", "w")
}

# Merge all TSV files into the appropriate destination file based on their suffix
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    suffix = data_file.split("_")[-1]
    content = open("./" + data_file).read()
    print(content, file=destinations[suffix])

# Close all destination files
for destination in destinations:
    destinations[destination].close()
Processing  197
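As an optional sanity check, you can report how many lines ended up in each combined file:

# Report the number of lines written to each combined file
for name in ["asv.tsv", "seq.tsv", "tax.tsv", "met.tsv"]:
    with open(name) as handle:
        print(name, sum(1 for _ in handle), "lines")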

Cleaning the taxonomy table

# Process taxonomy: build a table of ASV identifiers and Greengenes-style lineage strings
import pandas as pd

lines = []
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    # Pad missing ranks with "NA" so every record has an identifier plus seven taxonomic levels
    if len(line) < 8:
        line = line + ["NA"] * (8 - len(line))
    formatted = [line[0], "k__" + line[1] + "; p__" + line[2] + "; c__" + line[3] + "; o__" + line[4] + "; f__" + line[5] + "; g__" + line[6] + "; s__" + line[7]]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")

print(tax_df.head())
                                                                                             taxonomy
ID                                                                                                   
0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca...  k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...
02a5d30774a4aa2bb68b37ad053084e4a8756498945c258...  k__Bacteria; p__Thermodesulfobacteriota; c__De...
032dc4f2f5b3511282069f4220eae7e27135aee75489b41...  k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...
0612b5106089c3769793ee4294d675a3f6a82c090708710...  k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...
06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e...  k__Bacteria; p__Bacillota; c__Clostridia; o__L...

Transforming the ASV file into a matrix

# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # The second column holds the ASV identifier (matrix rows)
    size["nRow"].add(line[1])
    # The first column holds the sample identifier (matrix columns)
    size["nCol"].add(line[0])

# Build an empty ASV-by-sample matrix from the long (three-column) format
asv_df = pd.DataFrame(index=range(len(size["nRow"])), columns=range(len(size["nCol"])))
asv_df.columns = list(size["nCol"])

# Use the sorted ASV identifiers as the row index
asv_index = sorted(size["nRow"])
asv_df.index = asv_index

# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, ASV and count value
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Write the count into the matrix: ASV as row label, sample as column label
    asv_df.loc[asv, sample] = value
50000 57861
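As an alternative to filling the matrix cell by cell, the same long-to-wide reshaping can be done with a pandas pivot. A sketch that reuses the lines list parsed above and assumes the sample, ASV, count column order used in this notebook:

# Alternative: reshape the three-column records with a pandas pivot
asv_long = pd.DataFrame(lines, columns=["sample", "asv", "count"])
asv_long["count"] = pd.to_numeric(asv_long["count"])
asv_wide = asv_long.pivot_table(index="asv", columns="sample", values="count", aggfunc="first")

The resulting asv_wide should match asv_df up to row and column ordering.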

Combining the matrices

# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)

# Replace the row names (ASV hashes) with ASV_1, ASV_2, ...
result.index = ["ASV_" + str(index + 1) for index in range(len(result))]

# Write to file
result.to_csv("merged.tsv", sep="\t")

# Load the merged table and fix the header lines for BIOM conversion
lines = open("merged.tsv").readlines()
# Add a comment header
lines.insert(0, "# Automatically generated input file for biom conversion")
# Prefix the column header line with "#OTU ID"
lines[1] = "#OTU ID" + lines[1]
# Write the result back to file
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
/var/folders/8z/90xkbc014w55jjnpdb62xg2m0000gn/T/ipykernel_5284/2224568574.py:6: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  result.fillna(0, inplace=True)
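The FutureWarning above is pandas flagging the silent downcasting that fillna performs on object columns. Following the suggestion in the warning itself, it can be avoided with an explicit infer_objects call, for example:

# Fill missing counts with 0 and downcast explicitly to avoid the FutureWarning
result = result.fillna(0).infer_objects(copy=False)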

Adding sample metadata and saving the BIOM file

# Load biom and json
from biom import load_table
import json

# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")

# Load metadata file
sample_ids = set(biom_content.ids(axis="sample"))
metadata = {}
keys = set()

# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty lines
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Workaround for the biom reader: all samples need the same metadata keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)

# Ensure every sample has the complete, sorted set of metadata keys
keys = sorted(keys)
# Fill in missing keys with a "None" placeholder
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"

# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")

# Create Python object from JSON string data
obj = json.loads(json_data)

# Fix taxonomy formatting: split each lineage string into a list of ranks
for row in obj["rows"]:
    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")

# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)

# Write the BIOM (JSON) file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as handle:
    print(json_formatted_str, file=handle)
Writing biom file to merged.biom
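As a final check, the generated file can be read back with the biom library to confirm it is a valid table; a minimal sketch:

# Reload the generated BIOM file and report its dimensions
check = load_table("merged.biom")
print(check.shape[0], "ASVs x", check.shape[1], "samples")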