Amplicon retrieval#
This document describes how to obtain multiple processed amplicon datasets from individual samples. After the dataset files have been retrieved, they are combined into a single object for downstream analysis.
Data retrieval using iBridges#
Check the iBridges data retrieval page for the latest information on how to retrieve data from iBridges.
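iBridges reads the connection details for your iRODS zone from an iRODS environment file, by default ~/.irods/irods_environment.json. The snippet below is a minimal sketch of creating such a file; the host, port, user and zone values are placeholders, and your zone may require additional fields (for example an authentication scheme).
import json
from pathlib import Path

# Placeholder values: replace these with the settings of your own iRODS zone
env = {
    "irods_host": "irods.example.org",
    "irods_port": 1247,
    "irods_user_name": "your_username",
    "irods_zone_name": "exampleZone",
}

env_file = Path.home() / ".irods" / "irods_environment.json"
env_file.parent.mkdir(parents=True, exist_ok=True)
env_file.write_text(json.dumps(env, indent=4))
print("Wrote", env_file)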
Amplicon data retrieval#
Retrieval of the ASV, taxonomy, sequence and metadata files and their transformation into a single BIOM file.
%pip install ibridges ipywidgets pandas biom-format --quiet
Note: you may need to restart the kernel to use updated packages.
Authentication and download of ASV / SEQ / MET / TAX files
# We assume you have already installed iBridges and created an iRODS environment JSON file
from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download
import concurrent.futures
from tqdm.notebook import tqdm
data_files = set()
def download_item(session, item):
    # Download a single data object into data/ and record its local path
    download(session, item, "data/", overwrite=True)
    data_files.add("data/" + item.name)
def parallel_download(session, results, max_workers=10):
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
# Wrap the results with tqdm for progress tracking
list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))
# Example usage:
session = interactive_auth()
# Searching for files in the iRODS collection
results = search_data(session, path_pattern="%project/%benchmark%/%_asv.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_seq.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_met.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_tax.tsv")
# Start the download process
parallel_download(session, results)
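Before merging, it can help to confirm that all four file types were downloaded. This small optional check uses the data_files set collected above and relies on the same underscore-suffix naming convention used later when merging.
from collections import Counter

# Count downloaded files per type suffix (asv.tsv, seq.tsv, met.tsv, tax.tsv)
suffix_counts = Counter(name.split("_")[-1] for name in data_files)
print(suffix_counts)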
Combining the ASV, SEQ, MET and TAX files
destinations = {
"seq.tsv" : open("seq.tsv", "w"),
"asv.tsv" : open("asv.tsv", "w"),
"tax.tsv" : open("tax.tsv", "w"),
"met.tsv" : open("met.tsv", "w")
}
# Merge all tsv files into the appropriate file object
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    # The file suffix (asv.tsv, seq.tsv, tax.tsv or met.tsv) selects the destination
    suffix = data_file.split("_")[-1]
    with open("./" + data_file) as handle:
        print(handle.read(), file=destinations[suffix])
# Close all destination files
for destination in destinations:
destinations[destination].close()
Processing 197
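As an optional sanity check before the next steps, you can count how many lines ended up in each combined file.
# Report the number of lines written to each combined file
for name in ["asv.tsv", "seq.tsv", "tax.tsv", "met.tsv"]:
    with open(name) as handle:
        print(name, sum(1 for _ in handle), "lines")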
Cleaning the taxonomy table
# Process taxonomy
import pandas as pd
lines = []
for line in open("tax.tsv"):
line = line.strip().split("\t")
if len(line) < 8:
line = line[:8] + ["NA"]*(8 - len(line))
    # Build a Greengenes-style lineage string from the seven taxonomic ranks
    prefixes = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]
    lineage = "; ".join(prefix + name for prefix, name in zip(prefixes, line[1:8]))
    lines.append([line[0], lineage])
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")
print(tax_df.head())
taxonomy
ID
0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca... k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...
02a5d30774a4aa2bb68b37ad053084e4a8756498945c258... k__Bacteria; p__Thermodesulfobacteriota; c__De...
032dc4f2f5b3511282069f4220eae7e27135aee75489b41... k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...
0612b5106089c3769793ee4294d675a3f6a82c090708710... k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...
06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e... k__Bacteria; p__Bacillota; c__Clostridia; o__L...
Transforming the files into matrices
# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
line = line.strip().split("\t")
# Skip header
if len(line) == 1: continue
# Add to lines
lines.append(line)
    # Rows: ASV identifiers (second column)
    size["nRow"].add(line[1])
    # Columns: sample identifiers (first column)
    size["nCol"].add(line[0])
# Build an empty ASV x sample matrix from the long (three-column) format
asv_df = pd.DataFrame(index=range(len(size["nRow"])),columns=range(len(size["nCol"])))
asv_df.columns = size["nCol"]
# Sort the index
asv_index = sorted(list(size["nRow"]))
asv_df.index = asv_index
# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
if index % 10000 == 0:
print(index, len(lines), end="\r")
# Clean the sample, asv and value
sample, asv, value = line
sample = sample.strip()
asv = asv.strip()
value = value.strip()
    # Set the value in the matrix using the ASV as row and the sample as column label
    asv_df.loc[asv, sample] = value
50000 57861
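The loop above fills the matrix cell by cell. As an alternative sketch, assuming asv.tsv is a three-column long-format table of sample, ASV and count (the unpacking order used above), the same wide matrix can be built with a pandas pivot. Note that this variant sums duplicate (ASV, sample) pairs, which is an assumption rather than the behaviour of the loop above.
import pandas as pd

# Column names are assumptions that mirror the unpacking order used above
# (sample identifier, ASV identifier, count)
long_df = pd.read_csv("asv.tsv", sep="\t", header=None,
                      names=["sample", "asv", "count"])
# Drop header or malformed lines that lack all three fields
long_df = long_df.dropna()
long_df["count"] = pd.to_numeric(long_df["count"], errors="coerce")
# Pivot to a wide ASV x sample matrix; absent combinations become 0
asv_df_alt = long_df.pivot_table(index="asv", columns="sample",
                                 values="count", fill_value=0, aggfunc="sum")
print(asv_df_alt.shape)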
Combining the matrices
# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)
# Rename the rownames
row_names = result.index.values
# Replace the rownames with ASV_1, ASV_2, ...
for index, value in enumerate(row_names):
row_names[index] = "ASV_" + str(index + 1)
result.index = row_names
# Write to file
result.to_csv("merged.tsv", sep="\t")
# Load and fix first line
lines = open("merged.tsv").readlines()
# Add header
lines.insert(0, "# Automatically generated input file for biom conversion")
# Add OTU ID header
lines[1] = "#OTU ID" + lines[1]
# Write back to file; the with-block closes the file so it is fully flushed
# before it is read again for the biom conversion
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
/var/folders/8z/90xkbc014w55jjnpdb62xg2m0000gn/T/ipykernel_5284/2224568574.py:6: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
result.fillna(0, inplace=True)
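The FutureWarning above is about fillna silently downcasting object-dtype columns. A forward-compatible sketch of the same fill step, following the hint in the warning (assuming pandas ≥ 2.1):
# Fill missing counts with 0 and infer the resulting dtypes explicitly,
# as suggested by the FutureWarning, instead of relying on silent downcasting
result = result.fillna(0).infer_objects(copy=False)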
Combining the matrices and saving the BIOM file
# Load biom and json
from biom import load_table
import json
# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")
# Collect the sample identifiers present in the BIOM table
sample_ids = set(biom_content.ids(axis="sample"))
metadata = {}
keys = set()
# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
line = line.strip()
    # Skip empty or malformed lines (no tab separator)
    if "\t" not in line: continue
identifier, key, value = line.split("\t")
    # Bug in the biom reader: all metadata entries need to have the same keys in the same order
keys.add(key)
# Skip lines that are not in this biom object
if identifier not in sample_ids: continue
if identifier not in metadata:
metadata[identifier] = {}
metadata[identifier][key] = str(value)
# Add missing keys to metadata using a sorted list
keys = sorted(keys)
# Add missing keys to metadata
for identifier in metadata:
for key in keys:
if key not in metadata[identifier]:
metadata[identifier][key] = "None"
# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
# Create Python object from JSON string data
obj = json.loads(json_data)
# Fix taxonomy split issue
for index, row in enumerate(obj["rows"]):
row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")
# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
# Write the pretty-printed JSON to the biom file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as handle:
    print(json_formatted_str, file=handle)
Writing biom file to merged.biom
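To verify that the result can be consumed downstream, the BIOM file can be loaded back with the biom package; a minimal sketch:
from biom import load_table

# Re-load the generated BIOM file and inspect its dimensions and metadata
table = load_table("merged.biom")
print(table.shape)                       # (number of ASVs, number of samples)
print(table.ids(axis="sample")[:5])      # first few sample identifiers
print(table.metadata(axis="sample")[0])  # metadata of the first sample, if attached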