{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Amplicon retrieval\n", "\n", "This document describes how to obtain multiple processed amplicon datasets from individual samples. After retrieval of the dataset files it will convert this into a single object to be used for downstream analysis.\n", "\n", "## Data retrieval using iBridges\n", "\n", "This page assumes you have already installed iBridges and created an irods json file. Check the [iBridges python client page](../storage/ibridges_python.ipynb) for the latest information on how to install, authenticate, and retrieve data from iBridges.\n", "\n", "### Amplicon data retrieval\n", "Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%pip install ibridges ipywidgets pandas biom-format --quiet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Authentication and download of ASV / SEQ / MET / TAX files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d951d12b970843fc98deae981f37f122", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from ibridges.interactive import interactive_auth\n", "from ibridges import search_data\n", "from ibridges import download\n", "\n", "import concurrent.futures\n", "from tqdm.notebook import tqdm\n", "\n", "data_files = set()\n", "\n", "def download_item(session, item):\n", " download(session, item, \"data/\", overwrite=True)\n", " data_files.add(\"data/\" + item.name)\n", "\n", "def parallel_download(session, results, max_workers=10):\n", " with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n", " # Wrap the results with tqdm for progress tracking\n", " list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc=\"Downloading\"))\n", "\n", "# Example usage:\n", "session = interactive_auth()\n", "\n", "# Searching for files in the iRODS collection\n", "results = search_data(session, path_pattern=\"%project/%benchmark%/%_asv.tsv\")\n", "results += search_data(session, path_pattern=\"%project/%benchmark%/%_seq.tsv\")\n", "results += search_data(session, path_pattern=\"%project/%benchmark%/%_met.tsv\")\n", "results += search_data(session,path_pattern=\"%project/%benchmark%/%_tax.tsv\")\n", "# Start the download process\n", "parallel_download(session, results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combining the ASV, SEQ, MET and TAX files" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing 197\r" ] } ], "source": [ "destinations = {\n", " \"seq.tsv\" : open(\"seq.tsv\", \"w\"), \n", " \"asv.tsv\" : open(\"asv.tsv\", \"w\"),\n", " \"tax.tsv\" : open(\"tax.tsv\", \"w\"),\n", " \"met.tsv\" : open(\"met.tsv\", \"w\")\n", "}\n", "\n", "# Merge all tsv files into the appropiate file object\n", "for index, data_file in enumerate(data_files):\n", " print(\"Processing \", index, end=\"\\r\")\n", " id = data_file.split(\"_\")[-1]\n", " content = open(\"./\" + data_file).read()\n", " print(content, file=destinations[id])\n", "\n", "# Close all destination files\n", "for destination in destinations:\n", " destinations[destination].close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cleaning the taxonomy table" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " taxonomy\n", "ID \n", "0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca... k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...\n", "02a5d30774a4aa2bb68b37ad053084e4a8756498945c258... k__Bacteria; p__Thermodesulfobacteriota; c__De...\n", "032dc4f2f5b3511282069f4220eae7e27135aee75489b41... k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...\n", "0612b5106089c3769793ee4294d675a3f6a82c090708710... k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...\n", "06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e... k__Bacteria; p__Bacillota; c__Clostridia; o__L...\n" ] } ], "source": [ "# Process taxonomy\n", "import pandas as pd\n", "\n", "lines = []\n", "for line in open(\"tax.tsv\"):\n", " line = line.strip().split(\"\\t\")\n", " if len(line) < 8:\n", " line = line[:8] + [\"NA\"]*(8 - len(line))\n", " formatted = [line[0], \"k__\" + line[1]+ \"; p__\" + line[2]+ \"; c__\" + line[3]+ \"; o__\" + line[4]+ \"; f__\" + line[5]+ \"; g__\" + line[6]+ \"; s__\" + line[7]]\n", " lines.append(formatted)\n", "tax_df = pd.DataFrame(lines)\n", "tax_df.columns = [\"ID\", \"taxonomy\"]\n", "tax_df = tax_df.set_index(\"ID\")\n", "\n", "print(tax_df.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transforming the files into matrices" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "50000 57861\r" ] } ], "source": [ "# Load and transform the ASV file into a matrix\n", "lines = []\n", "size = {\"nRow\": set(), \"nCol\": set()}\n", "# Read the asv.tsv file\n", "for line in open(\"asv.tsv\"):\n", " line = line.strip().split(\"\\t\")\n", " # Skip header\n", " if len(line) == 1: continue\n", " # Add to lines\n", " lines.append(line)\n", " # Row sample\n", " size[\"nRow\"].add(line[1])\n", " # Column ASV\n", " size[\"nCol\"].add(line[0])\n", "\n", "# Three columns to matrix method \n", "asv_df = pd.DataFrame(index=range(len(size[\"nRow\"])),columns=range(len(size[\"nCol\"])))\n", "asv_df.columns = size[\"nCol\"]\n", "\n", "# Sort the index\n", "asv_index = sorted(list(size[\"nRow\"]))\n", "asv_df.index = asv_index\n", "\n", "# Fill the matrix with the values from the lines\n", "for index, line in enumerate(lines):\n", " if index % 10000 == 0:\n", " print(index, len(lines), end=\"\\r\")\n", " # Clean the sample, asv and value\n", " sample, asv, value = line\n", " sample = sample.strip()\n", " asv = asv.strip()\n", " value = value.strip()\n", " # Set the value to the matrix using the sample and asv as index\n", " asv_df.loc[asv, sample]=value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combining the matrices" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/8z/90xkbc014w55jjnpdb62xg2m0000gn/T/ipykernel_5284/2224568574.py:6: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " result.fillna(0, inplace=True)\n" ] } ], "source": [ "# Merge the ASV and taxonomy dataframes\n", "result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)\n", "# Remove duplicates\n", "result = result.drop_duplicates(keep=\"first\")\n", "# Fill NaN with 0\n", "result.fillna(0, inplace=True)\n", "\n", "# Rename the rownames\n", "row_names = result.index.values\n", "# Replace the rownames with ASV_1, ASV_2, ...\n", "for index, value in enumerate(row_names):\n", " row_names[index] = \"ASV_\" + str(index + 1)\n", "result.index = row_names\n", "\n", "# Write to file\n", "result.to_csv(\"merged.tsv\", sep=\"\\t\")\n", "\n", "# Load and fix first line\n", "lines = open(\"merged.tsv\").readlines()\n", "# Add header\n", "lines.insert(0, \"# Automatically generated input file for biom conversion\")\n", "# Add OTU ID header\n", "lines[1] = \"#OTU ID\" + lines[1]\n", "# Write to file\n", "output = open(\"merged.tsv\", \"w\")\n", "for line in lines:\n", " print(line.strip(), file=output)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combining the matrices and saving the BIOM file" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing biom file to merged.biom\n" ] } ], "source": [ "# Load biom and json\n", "from biom import load_table\n", "import json\n", "\n", "# Convert merged.tsv to biom file\n", "biom_content = load_table(\"merged.tsv\")\n", "\n", "# Load metadata file\n", "sample_ids = set(biom_content._sample_ids)\n", "metadata = {}\n", "keys = set()\n", "\n", "# Read the met.tsv file\n", "for index, line in enumerate(open(\"met.tsv\")):\n", " line = line.strip()\n", " # Skip empty lines\n", " if \"\\t\" not in line: continue\n", " identifier, key, value = line.split(\"\\t\")\n", " # Bug in biom reader, all metadata need to have the same keys in the same order\n", " keys.add(key)\n", " # Skip lines that are not in this biom object\n", " if identifier not in sample_ids: continue\n", " if identifier not in metadata:\n", " metadata[identifier] = {}\n", " metadata[identifier][key] = str(value)\n", "\n", "# Add missing keys to metadata using a sorted list\n", "keys = sorted(keys)\n", "# Add missing keys to metadata\n", "for identifier in metadata:\n", " for key in keys:\n", " if key not in metadata[identifier]:\n", " metadata[identifier][key] = \"None\"\n", "\n", "# Add metadata\n", "biom_content.add_metadata(metadata)\n", "# Set biom type\n", "biom_content.type = \"OTU table\"\n", "json_data = biom_content.to_json(generated_by=\"UNLOCK conversion module\")\n", "\n", "# Create Python object from JSON string data\n", "obj = json.loads(json_data)\n", "\n", "# Fix taxonomy split issue\n", "for index, row in enumerate(obj[\"rows\"]):\n", " row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split(\"; \")\n", "\n", "# Pretty Print JSON\n", "json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)\n", "\n", "# Create biom file\n", "biom_file = \"merged.biom\"\n", "print(\"Writing biom file to\", biom_file)\n", "print(json_formatted_str, file=open(biom_file, \"w\"))\n" ] } ], "metadata": { "kernelspec": { "display_name": "Testing_nbs", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }