Amplicon BIOM#

This document collects the functions used for amplicon analysis within UNLOCK.

Connection settings#

Set the following variables in the system environment (a sketch for setting them from within Python follows the list):

  1. irodsUserName

  2. irodsPassword
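
If the variables are not already exported by the shell, they can also be set for the current Python process. The snippet below is a minimal sketch; the interactive prompts are an assumption, and any other way of populating the two environment variables works just as well.

import os
from getpass import getpass

# Set the iRODS credentials for this process only when they are missing from the
# environment (interactive prompts are an assumption, not part of the UNLOCK setup)
if os.getenv("irodsUserName") is None:
    os.environ["irodsUserName"] = input("iRODS username: ")
if os.getenv("irodsPassword") is None:
    os.environ["irodsPassword"] = getpass("iRODS password: ")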

from irods.session import iRODSSession
import ssl
from irods.meta import iRODSMeta
import os, sys

# iRODS authentication information
host = "data.m-unlock.nl"
port = 1247
zone = "unlock"
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')

# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)

ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                "irods_authentication_scheme": "pam_password",
                'ssl_context': context}
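
With the settings in place, a short check can confirm that the connection and credentials work. This is a minimal sketch, assuming the account is allowed to list the collections under the zone's home directory:

# Quick connectivity check (sketch): open a session and list the collections under /unlock/home
with iRODSSession(host=host, port=port, user=user, password=password, zone=zone, **ssl_settings) as session:
    home = session.collections.get("/" + zone + "/home")
    for collection in home.subcollections:
        print(collection.path)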
                

Amplicon#

Amplicon data retrieval#

Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file.

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '/unlock/home/wur.unlock.hivghana_drp0070%_PHYLOSEQ'))
    
    data_files = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        data_files.add(data_file)
    for index, file_path in enumerate(data_files):
        # Original path in irods is local path from current location
        folder_path = os.path.dirname(file_path)
        if not os.path.exists("." + file_path):
            os.makedirs("." + folder_path, exist_ok=True)
            print("Downloading: " + str(index) + " of " + str(len(data_files)) + " " + file_path, end="\r")
            session.data_objects.get(file_path, "." + file_path)
        else:
            print("Already received: " + file_path + " "*10, end="\r")
destinations = {
    "seq.tsv" : open("seq.tsv", "w"), 
    "asv.tsv" : open("asv.tsv", "w"),
    "tax.tsv" : open("tax.tsv", "w"),
    "met.tsv" : open("met.tsv", "w")
}

# Merge all tsv files into the appropriate file object
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    id = data_file.split("_")[-1]
    content = open("." + data_file).read()
    print(content, file=destinations[id])

# Close all destination files
for destination in destinations:
    destinations[destination].close()
# Process taxonomy
import pandas as pd

lines = []
# Each line of tax.tsv holds an identifier followed by up to seven taxonomic ranks
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    # Pad missing ranks with NA so every record has eight fields
    if len(line) < 8:
        line = line + ["NA"] * (8 - len(line))
    # Build a Greengenes-style taxonomy string: k__; p__; c__; o__; f__; g__; s__
    ranks = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]
    formatted = [line[0], "; ".join(prefix + name for prefix, name in zip(ranks, line[1:8]))]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")

print(tax_df.head())
# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # Rows: ASV identifiers (second column of asv.tsv)
    size["nRow"].add(line[1])
    # Columns: sample identifiers (first column of asv.tsv)
    size["nCol"].add(line[0])

# Build an empty ASV (rows) x sample (columns) matrix from the three-column table
asv_df = pd.DataFrame(index=sorted(size["nRow"]), columns=sorted(size["nCol"]))

# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, asv and value
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Set the value to the matrix using the sample and asv as index
    asv_df.loc[asv, sample]=value
# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)

# Rename the rows to ASV_1, ASV_2, ...
result.index = ["ASV_" + str(i + 1) for i in range(len(result))]

# Write to file
result.to_csv("merged.tsv", sep="\t")

# Load and fix first line
lines = open("merged.tsv").readlines()
# Add header
lines.insert(0, "# Automatically generated input file for biom conversion")
# Add OTU ID header
lines[1] = "#OTU ID" + lines[1]
# Write back to file and close it so the biom parser reads the complete content
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
# Load biom and json
from biom import load_table
import json

# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")

# Collect the sample identifiers present in the BIOM table
sample_ids = set(biom_content.ids(axis="sample"))
metadata = {}
keys = set()

# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty lines
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Workaround for a bug in the biom reader: every sample's metadata must have the same keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)

# Add missing keys to metadata using a sorted list
keys = sorted(keys)
# Add missing keys to metadata
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"

# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")

# Create Python object from JSON string data
obj = json.loads(json_data)

# Fix taxonomy split issue
for index, row in enumerate(obj["rows"]):
    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")

# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)

# Create the biom file and close the handle so the content is flushed to disk
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as handle:
    print(json_formatted_str, file=handle)
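
As an optional sanity check (a sketch, not part of the original workflow), the written file can be loaded back with the biom reader to confirm that it parses and to inspect its dimensions:

# Reload the generated BIOM file and report its size (sketch)
check = load_table(biom_file)
print(check.shape[0], "observations x", check.shape[1], "samples")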

Amplicon#

PICRUSt data retrieval#

Retrieval of the PICRUSt functional prediction tables (files ending in _predicted.tsv.gz) from iRODS.

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%STU_BO3B-BR1%')).filter( \
        Criterion('like', DataObject.name, '%_predicted.tsv.gz'))
    
    for r in results:
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # Original path in irods is local path from current location
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: " + data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")