Amplicon BIOM
This document contains the functions used in UNLOCK for amplicon analysis: retrieving result files from iRODS and converting them into a BIOM file.
Connection settings
Set the following variables in your system environment:
irodsUserName
irodsPassword
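Before starting Python, the credentials can be exported in the shell (for example with export on Linux/macOS). As a minimal sketch, the following check confirms that both variables are present before attempting a connection; the variable names follow the convention above.
import os

# Minimal sketch: verify both credentials are set before connecting.
for name in ("irodsUserName", "irodsPassword"):
    if not os.getenv(name):
        raise SystemExit("Environment variable " + name + " is not set")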
from irods.session import iRODSSession
import ssl
from irods.meta import iRODSMeta
import os, sys
# iRODS authentication information
host = "unlock-icat.irods.surfsara.nl"
port = 1247
zone = "unlock"
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')
# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)
ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
'irods_client_server_policy': 'CS_NEG_REQUIRE',
'irods_encryption_algorithm': 'AES-256-CBC',
'irods_encryption_key_size': 32,
'irods_encryption_num_hash_rounds': 16,
'irods_encryption_salt_size': 8,
"irods_authentication_scheme": "pam",
'ssl_context': context}
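To verify these settings, a session can be opened and a collection listed. This is a minimal sketch, assuming your account can read the /unlock/home collection; the download sections below open their sessions the same way.
# Minimal sketch: open a session and list subcollections to confirm
# that the authentication and SSL settings work.
with iRODSSession(host=host, port=port, user=user, password=password, zone=zone, **ssl_settings) as session:
    home = session.collections.get("/" + zone + "/home")
    for collection in home.subcollections:
        print(collection.path)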
Amplicon
Amplicon data retrieval
Retrieval of ASV, taxonomy, sequence and metadata information, and transformation into a BIOM file.
from irods.models import Collection, DataObject
from irods.column import Criterion
import os
# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '/unlock/home/wur.unlock.hivghana_drp0070%_PHYLOSEQ'))
    data_files = set()
    for r in results:
        # Collect the full iRODS path of each result file
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        data_files.add(data_file)
    for index, file_path in enumerate(data_files):
        # The iRODS path is mirrored locally, relative to the current directory
        folder_path = os.path.dirname(file_path)
        if not os.path.exists("." + file_path):
            os.makedirs("." + folder_path, exist_ok=True)
            print("Downloading: " + str(index) + " of " + str(len(data_files)) + " " + file_path, end="\r")
            session.data_objects.get(file_path, "." + file_path)
        else:
            print("Already received: " + file_path + " "*10, end="\r")
Already received: /unlock/home/wur.unlock.hivghana_drp0070/STU_DRP007092/OBS_XDRS176960/SAM_DRS176960/amplicon/ASY_DRR243924/NGTAX_Silva138.1-picrust_100_DRR243924/4_PHYLOSEQ/DRR243924_seq.tsv
destinations = {
    "seq.tsv" : open("seq.tsv", "w"),
    "asv.tsv" : open("asv.tsv", "w"),
    "tax.tsv" : open("tax.tsv", "w"),
    "met.tsv" : open("met.tsv", "w")
}
# Merge all tsv files into the appropriate file object
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    # The filename suffix (seq.tsv, asv.tsv, tax.tsv or met.tsv) selects the destination
    suffix = data_file.split("_")[-1]
    content = open("." + data_file).read()
    print(content, file=destinations[suffix])
# Close all destination files
for destination in destinations:
    destinations[destination].close()
Processing 35
# Process taxonomy
import pandas as pd
lines = []
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    # Pad incomplete lineages with NA up to the seven taxonomic ranks
    if len(line) < 8:
        line = line + ["NA"] * (8 - len(line))
    formatted = [line[0], "k__" + line[1] + "; p__" + line[2] + "; c__" + line[3] + "; o__" + line[4] + "; f__" + line[5] + "; g__" + line[6] + "; s__" + line[7]]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")
print(tax_df.head())
taxonomy
ID
k__NA; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA
01a0bec405b65f107a84eb1bdaf0ccbbcc177f487bab236... k__Bacteria; p__Firmicutes; c__Bacilli; o__RF3...
01dc299b033cd3ddc735f033312a8ca6bfdee54fc12c2f9... k__Bacteria; p__Firmicutes; c__Clostridia; o__...
02db5a224a4f92228a2e78129f5ce999bde8f9d40fba844... k__Bacteria; p__Firmicutes; c__Clostridia; o__...
04a66337670a05caa7ff58d5d9516acac7646344dea7356... k__Bacteria; p__Firmicutes; c__Clostridia; o__...
# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file (three tab-separated columns: sample, ASV, count)
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header lines
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # Rows are the ASV identifiers
    size["nRow"].add(line[1])
    # Columns are the sample identifiers
    size["nCol"].add(line[0])
# Three columns to matrix method
asv_df = pd.DataFrame(index=range(len(size["nRow"])), columns=range(len(size["nCol"])))
# Sort columns and index for a deterministic layout
asv_df.columns = sorted(size["nCol"])
asv_index = sorted(size["nRow"])
asv_df.index = asv_index
# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, asv and value fields
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Set the value in the matrix using the ASV as row and the sample as column
    asv_df.loc[asv, sample] = value
0 8800
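The same matrix can also be built in one step with a pandas pivot instead of filling it cell by cell. This is an alternative sketch, assuming asv.tsv holds three tab-separated columns (sample, ASV, count) with occasional single-column header lines, as handled above.
import pandas as pd

# Alternative sketch: collect the long-format records, then pivot to a matrix.
records = []
for line in open("asv.tsv"):
    parts = [field.strip() for field in line.strip().split("\t")]
    # Skip header or malformed lines
    if len(parts) == 3:
        records.append(parts)
long_df = pd.DataFrame(records, columns=["sample", "asv", "count"])
# aggfunc="first" keeps the first value if a (asv, sample) pair occurs twice
asv_alt = long_df.pivot_table(index="asv", columns="sample", values="count", aggfunc="first")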
# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)
# Replace the row names (ASV hashes) with ASV_1, ASV_2, ...
result.index = ["ASV_" + str(index + 1) for index in range(len(result))]
# Write to file
result.to_csv("merged.tsv", sep="\t")
# Load and fix the first line
lines = open("merged.tsv").readlines()
# Add a comment header
lines.insert(0, "# Automatically generated input file for biom conversion")
# Prefix the column header with the OTU ID label
lines[1] = "#OTU ID" + lines[1]
# Write to file
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
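As a quick sanity check before conversion, the first two lines of merged.tsv should now be the comment line and the "#OTU ID" header followed by the sample columns and "taxonomy".
# Quick check: inspect the header lines of the file just written.
with open("merged.tsv") as handle:
    print(handle.readline().rstrip())
    print(handle.readline().rstrip())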
# Load biom and json
from biom import load_table
import json
# Convert merged.tsv to a biom table
biom_content = load_table("merged.tsv")
# Collect the sample identifiers present in this biom object
sample_ids = set(biom_content.ids(axis="sample"))
metadata = {}
keys = set()
# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty lines
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Bug in the biom reader: all samples need the same metadata keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)
# Sort the keys so all samples share the same key order
keys = sorted(keys)
# Add missing keys to each sample's metadata
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"
# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")
# Create Python object from JSON string data
obj = json.loads(json_data)
# Fix taxonomy split issue: turn the lineage string into a list of ranks
for index, row in enumerate(obj["rows"]):
    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")
# Pretty print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)
# Create biom file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as handle:
    print(json_formatted_str, file=handle)
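As an optional round-trip check, the written file can be parsed again with load_table; the shape should match the merged table (observations by samples). This assumes the conversion cell above ran successfully.
# Optional sanity check: re-load the generated BIOM file and inspect it.
check = load_table(biom_file)
print(check.shape)                        # (observations, samples)
print(check.ids(axis="observation")[:5])  # first few ASV identifiers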
Note: in some environments the biom import fails with "ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject". This indicates that the compiled biom-format extension was built against a different numpy version than the one installed; reinstalling biom-format against the current numpy typically resolves it.
Amplicon
PICRUSt data retrieval
from irods.models import Collection, DataObject
from irods.column import Criterion
import os
# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%STU_BO3B-BR1%')).filter( \
        Criterion('like', DataObject.name, '%_predicted.tsv.gz'))
    for r in results:
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # The iRODS path is mirrored locally, relative to the current directory
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: " + data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")