Amplicon retrieval#

This document describes how to obtain multiple processed amplicon datasets from individual samples. After the dataset files have been retrieved, they are combined into a single object for downstream analysis.

Data retrieval using iBridges#

Check the iBridges data retrieval page for the latest information on how to retrieve data from iBridges.
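
iBridges authenticates against the iRODS environment file created during setup. The sketch below, assuming the standard client location ~/.irods/irods_environment.json, simply verifies that this file is in place before you start:

import json
from pathlib import Path

# Default location of the iRODS environment file (assumption: a standard iRODS client setup)
env_file = Path.home() / ".irods" / "irods_environment.json"
if env_file.exists():
    env = json.loads(env_file.read_text())
    print("Found environment file for host:", env.get("irods_host", "<unknown>"))
else:
    print("No irods_environment.json found; create one before authenticating")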

Amplicon data retrieval#

Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file.

%pip install ibridges ipywidgets pandas biom-format --quiet
Note: you may need to restart the kernel to use updated packages.

Authentication and download of ASV / SEQ / MET / TAX files

# We assume you have already installed iBridges and created an iRODS environment json file
from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download

import concurrent.futures
from tqdm.notebook import tqdm

data_files = set()

def download_item(session, item):
    download(session, item, "data/", overwrite=True)
    data_files.add("data/" + item.name)

def parallel_download(session, results, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the results with tqdm for progress tracking
        list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))

# Example usage:
session = interactive_auth()

# Searching for files in the iRODS collection
results = search_data(session, path_pattern="%project/%benchmark%/%_asv.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_seq.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_met.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_tax.tsv")
# Start the download process
parallel_download(session, results)

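Before combining the files, it can help to verify that all four file types were retrieved. A small sketch that counts the downloaded files per suffix, reusing the data_files set filled during the download above:

from collections import Counter

# Count the downloaded files per type (asv, seq, met, tax)
suffix_counts = Counter(data_file.split("_")[-1] for data_file in data_files)
for suffix, count in sorted(suffix_counts.items()):
    print(suffix, count)
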
Combining the ASV, SEQ, MET and TAX files

destinations = {
    "seq.tsv" : open("seq.tsv", "w"), 
    "asv.tsv" : open("asv.tsv", "w"),
    "tax.tsv" : open("tax.tsv", "w"),
    "met.tsv" : open("met.tsv", "w")
}

# Merge all tsv files into the appropriate file object
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    suffix = data_file.split("_")[-1]
    content = open("./" + data_file).read()
    print(content, file=destinations[suffix])

# Close all destination files
for destination in destinations:
    destinations[destination].close()
Processing  197
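
A quick way to confirm that the merge produced non-empty files is to count the lines in each combined file, for example:

# Report the number of lines written to each combined file
for name in ["asv.tsv", "seq.tsv", "tax.tsv", "met.tsv"]:
    with open(name) as handle:
        print(name, sum(1 for _ in handle))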

Cleaning the taxonomy table

# Process taxonomy
import pandas as pd

lines = []
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    # Pad missing ranks with NA so every record has 8 fields (ID + 7 ranks)
    if len(line) < 8:
        line = line[:8] + ["NA"] * (8 - len(line))
    # Build the prefixed taxonomy string from the 7 ranks
    formatted = [line[0], "k__" + line[1] + "; p__" + line[2] + "; c__" + line[3] + "; o__" + line[4] + "; f__" + line[5] + "; g__" + line[6] + "; s__" + line[7]]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")

print(tax_df.head())
                                                                                             taxonomy
ID                                                                                                   
0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca...  k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...
02a5d30774a4aa2bb68b37ad053084e4a8756498945c258...  k__Bacteria; p__Thermodesulfobacteriota; c__De...
032dc4f2f5b3511282069f4220eae7e27135aee75489b41...  k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...
0612b5106089c3769793ee4294d675a3f6a82c090708710...  k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...
06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e...  k__Bacteria; p__Bacillota; c__Clostridia; o__L...

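As a sanity check on the cleaned taxonomy, the sketch below counts how many ASVs were padded with NA at the genus and species level (the NA labels come from the padding step above):

# Count ASVs without a genus or species level assignment
print("Missing genus:  ", tax_df["taxonomy"].str.contains("g__NA;").sum())
print("Missing species:", tax_df["taxonomy"].str.endswith("s__NA").sum())
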
Transforming the files into matrices

# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # Rows: ASV identifiers
    size["nRow"].add(line[1])
    # Columns: sample identifiers
    size["nCol"].add(line[0])

# Convert the three-column (long) format into an ASV x sample matrix
asv_df = pd.DataFrame(index=range(len(size["nRow"])), columns=range(len(size["nCol"])))
# Use a sorted sample list for a deterministic column order
asv_df.columns = sorted(size["nCol"])

# Sort the ASV identifiers and use them as the row index
asv_index = sorted(size["nRow"])
asv_df.index = asv_index

# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, asv and value
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Set the value to the matrix using the sample and asv as index
    asv_df.loc[asv, sample]=value
50000 57861

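Filling the matrix one cell at a time works, but can be slow for large datasets. As an alternative sketch, the same ASV-by-sample matrix can be built with a pandas pivot from the lines parsed above; note that pivot_table converts the counts to numeric values rather than keeping them as strings, so this is an illustration rather than a drop-in replacement:

# Alternative: pivot the long (sample, asv, count) records into a matrix in one call
long_df = pd.DataFrame(lines, columns=["sample", "asv", "count"])
long_df["count"] = pd.to_numeric(long_df["count"])
# pivot_table sums duplicate sample/ASV pairs, should any occur across input files
asv_df_alt = long_df.pivot_table(index="asv", columns="sample", values="count", aggfunc="sum")
print(asv_df_alt.shape)
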
Combining the matrices

# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)

# Replace the row names (ASV hashes) with ASV_1, ASV_2, ...
result.index = ["ASV_" + str(i + 1) for i in range(len(result.index))]

# Write to file
result.to_csv("merged.tsv", sep="\t")

# Load and fix first line
lines = open("merged.tsv").readlines()
# Add header
lines.insert(0, "# Automatically generated input file for biom conversion")
# Add OTU ID header
lines[1] = "#OTU ID" + lines[1]
# Write to file and close it so the contents are flushed before the next step
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
/var/folders/8z/90xkbc014w55jjnpdb62xg2m0000gn/T/ipykernel_5284/2224568574.py:6: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  result.fillna(0, inplace=True)
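
The FutureWarning above is caused by pandas silently downcasting object columns during fillna. As the warning message itself suggests, it can be avoided by opting in to the future behaviour or by inferring the dtypes explicitly:

# Option 1: opt in to the future pandas behaviour (no silent downcasting)
pd.set_option("future.no_silent_downcasting", True)

# Option 2: infer the object dtypes explicitly after fillna
result = result.fillna(0).infer_objects(copy=False)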

Adding sample metadata and saving the BIOM file

# Load biom and json
from biom import load_table
import json

# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")

# Load metadata file
sample_ids = set(biom_content.ids(axis="sample"))
metadata = {}
keys = set()

# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty lines
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Workaround for the biom reader: all metadata entries need the same keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)

# Add missing keys to metadata using a sorted list
keys = sorted(keys)
# Add missing keys to metadata
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"

# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")

# Create Python object from JSON string data
obj = json.loads(json_data)

# Split the taxonomy string into a list of ranks
for row in obj["rows"]:
    row["metadata"]["taxonomy"] = row["metadata"]["taxonomy"].split("; ")

# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)

# Create biom file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as biom_handle:
    print(json_formatted_str, file=biom_handle)
Writing biom file to merged.biom
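
As a final check, the generated BIOM file can be loaded back to confirm that it parses and has the expected dimensions:

# Reload the generated BIOM file and report its dimensions
check = load_table("merged.biom")
print("ASVs:   ", len(check.ids(axis="observation")))
print("Samples:", len(check.ids(axis="sample")))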