Amplicon retrieval#
This document describes how to obtain multiple processed amplicon datasets from individual samples. After the dataset files have been retrieved, they are combined into a single object for downstream analysis.
Data retrieval using iBridges#
This page assumes you have already installed iBridges and created an iRODS environment (JSON) file. Check the iBridges Python client page for the latest information on how to install, authenticate, and retrieve data from iBridges.
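For reference, a typical iRODS environment file (usually stored as ~/.irods/irods_environment.json) looks roughly like the sketch below. The host, port, zone, and user values are placeholders, and the exact fields required depend on your iRODS instance; consult your data manager or the iBridges documentation for the correct values.
{
    "irods_host": "irods.example.org",
    "irods_port": 1247,
    "irods_zone_name": "exampleZone",
    "irods_user_name": "your_username",
    "irods_authentication_scheme": "pam_password",
    "irods_home": "/exampleZone/home/your_username"
}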
Amplicon data retrieval#
Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file.
%pip install ibridges ipywidgets pandas biom-format --quiet
Note: you may need to restart the kernel to use updated packages.
Authentication and download of ASV / SEQ / MET / TAX files
# Import the iBridges functions and helper modules used for authentication and parallel download
from ibridges.interactive import interactive_auth
from ibridges import search_data
from ibridges import download
import concurrent.futures
from tqdm.notebook import tqdm
data_files = set()
def download_item(session, item):
    download(session, item, "data/", overwrite=True)
    data_files.add("data/" + item.name)
def parallel_download(session, results, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the results with tqdm for progress tracking
        list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc="Downloading"))
# Example usage:
session = interactive_auth()
# Searching for files in the iRODS collection
results = search_data(session, path_pattern="%project/%benchmark%/%_asv.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_seq.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_met.tsv")
results += search_data(session, path_pattern="%project/%benchmark%/%_tax.tsv")
# Start the download process
parallel_download(session, results)
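As an optional sanity check before merging, the snippet below (a minimal sketch; the suffixes simply mirror the search patterns above) counts how many files of each type ended up in the data_files set.
# Optional sanity check: count the downloaded files per type
expected_suffixes = ("_asv.tsv", "_seq.tsv", "_met.tsv", "_tax.tsv")
print(len(data_files), "files downloaded")
for suffix in expected_suffixes:
    print(suffix, sum(1 for f in data_files if f.endswith(suffix)))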
Combining the ASV, SEQ, MET and TAX files
destinations = {
    "seq.tsv": open("seq.tsv", "w"),
    "asv.tsv": open("asv.tsv", "w"),
    "tax.tsv": open("tax.tsv", "w"),
    "met.tsv": open("met.tsv", "w")
}
# Merge all TSV files into the appropriate destination file
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    # The filename suffix (asv.tsv, seq.tsv, met.tsv or tax.tsv) selects the destination
    file_type = data_file.split("_")[-1]
    content = open("./" + data_file).read()
    print(content, file=destinations[file_type])
# Close all destination files
for destination in destinations:
    destinations[destination].close()
Processing 197
Cleaning the taxonomy table
# Process taxonomy
import pandas as pd
lines = []
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    # Pad missing ranks with "NA" so every record has eight fields
    if len(line) < 8:
        line = line[:8] + ["NA"]*(8 - len(line))
    formatted = [line[0], "k__" + line[1] + "; p__" + line[2] + "; c__" + line[3] + "; o__" + line[4] + "; f__" + line[5] + "; g__" + line[6] + "; s__" + line[7]]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")
print(tax_df.head())
taxonomy
ID
0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca... k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...
02a5d30774a4aa2bb68b37ad053084e4a8756498945c258... k__Bacteria; p__Thermodesulfobacteriota; c__De...
032dc4f2f5b3511282069f4220eae7e27135aee75489b41... k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...
0612b5106089c3769793ee4294d675a3f6a82c090708710... k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...
06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e... k__Bacteria; p__Bacillota; c__Clostridia; o__L...
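The cleaning step above assumes that each row of tax.tsv contains an ASV identifier followed by up to seven ranks (kingdom through species), separated by tabs; rows with fewer ranks are padded with "NA". A minimal sketch with a made-up identifier and lineage illustrates the padding:
# Made-up tax.tsv row (identifier plus kingdom..genus, species missing), for illustration only
example = "abc123\tBacteria\tBacillota\tBacilli\tLactobacillales\tLactobacillaceae\tLactobacillus"
fields = example.split("\t")
fields = fields[:8] + ["NA"] * (8 - len(fields))
print(fields[0], "->", "; ".join(fields[1:]))
# abc123 -> Bacteria; Bacillota; Bacilli; Lactobacillales; Lactobacillaceae; Lactobacillus; NA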
Transforming the files into matrices
# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header lines (they contain no tab, so splitting yields a single field)
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # Rows of the matrix: ASV identifiers (second column)
    size["nRow"].add(line[1])
    # Columns of the matrix: sample identifiers (first column)
    size["nCol"].add(line[0])
# Three-column (long) format to matrix: build an empty ASV x sample frame
asv_df = pd.DataFrame(index=sorted(size["nRow"]), columns=sorted(size["nCol"]))
# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, asv and value
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Set the value in the matrix using the ASV and sample as row/column labels
    asv_df.loc[asv, sample] = value
50000 57861
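Filling the matrix cell by cell with .loc is straightforward but slow for large tables. A faster alternative is to pivot the long-format table in one step. The sketch below is not what the cell above does: it assumes the same sample / ASV / count column layout, coerces counts to numbers so that the concatenated header lines are dropped, and uses aggfunc="sum", which only matters if duplicate (ASV, sample) pairs occur.
# Alternative sketch: pivot the long-format table into an ASV x sample matrix
import pandas as pd
long_df = pd.read_csv("asv.tsv", sep="\t", header=None, names=["sample", "asv", "count"])
# Coerce counts to numbers; rows that fail (e.g. concatenated header lines) become NaN and are dropped
long_df["count"] = pd.to_numeric(long_df["count"], errors="coerce")
long_df = long_df.dropna()
asv_wide = long_df.pivot_table(index="asv", columns="sample", values="count", aggfunc="sum", fill_value=0)
print(asv_wide.shape)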
Combining the matrices
# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)
# Rename the rownames
row_names = result.index.values
# Replace the rownames with ASV_1, ASV_2, ...
for index, value in enumerate(row_names):
    row_names[index] = "ASV_" + str(index + 1)
result.index = row_names
# Write to file
result.to_csv("merged.tsv", sep="\t")
# Load and fix first line
lines = open("merged.tsv").readlines()
# Add header
lines.insert(0, "# Automatically generated input file for biom conversion")
# The biom TSV parser expects the header line to start with "#OTU ID"
lines[1] = "#OTU ID" + lines[1]
# Write to file
output = open("merged.tsv", "w")
for line in lines:
    print(line.strip(), file=output)
# Close the file so it is fully written before being read again below
output.close()
/var/folders/8z/90xkbc014w55jjnpdb62xg2m0000gn/T/ipykernel_5284/2224568574.py:6: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
result.fillna(0, inplace=True)
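The FutureWarning above is harmless here; it comes from fillna(0) downcasting object columns. If you want to silence it, the dtype conversion can be made explicit, as the warning message itself suggests (a sketch; the result is unchanged):
# Make the dtype conversion explicit before filling missing values
result = result.infer_objects(copy=False)
result = result.fillna(0)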
Adding sample metadata and saving the BIOM file
# Load biom and json
from biom import load_table
import json
# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")
# Collect the sample identifiers present in the BIOM table
sample_ids = set(biom_content.ids("sample"))
metadata = {}
keys = set()
# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty or malformed lines (no tab)
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Bug in the biom reader: every sample's metadata needs the same keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)
# Sort the keys so every sample gets them in the same order
keys = sorted(keys)
# Fill in any missing keys for each sample
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"
# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
# Create Python object from JSON string data
obj = json.loads(json_data)
# Store taxonomy as a list of ranks instead of a single string
for row in obj["rows"]:
    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")
# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
# Create biom file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as out_handle:
    print(json_formatted_str, file=out_handle)
Writing biom file to merged.biom
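As a final check (a minimal sketch; the exact numbers depend on your data), the written file can be loaded back with the biom package to confirm that it parses and to inspect its dimensions:
# Re-load the generated BIOM file and report its dimensions
from biom import load_table
check = load_table("merged.biom")
n_obs, n_samples = check.shape
print(n_obs, "ASVs across", n_samples, "samples")
print("First sample IDs:", list(check.ids("sample")[:3]))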