{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Amplicon retrieval\n", "\n", "This document describes how to obtain multiple processed amplicon datasets from individual samples. After retrieval of the dataset files it will convert this into a single object to be used for downstream analysis.\n", "\n", "## Data retrieval using iBridges\n", "\n", "This page assumes you have already installed iBridges and created an irods json file. Check the [iBridges python client page](../storage/ibridges_python.ipynb) for the latest information on how to install, authenticate, and retrieve data from iBridges.\n", "\n", "### Amplicon data retrieval\n", "Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%pip install ibridges ipywidgets pandas biom-format --quiet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Authentication and download of ASV / SEQ / MET / TAX files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d951d12b970843fc98deae981f37f122", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from ibridges.interactive import interactive_auth\n", "from ibridges import search_data\n", "from ibridges import download\n", "\n", "import concurrent.futures\n", "from tqdm.notebook import tqdm\n", "\n", "data_files = set()\n", "\n", "def download_item(session, item):\n", " download(session, item, \"data/\", overwrite=True)\n", " data_files.add(\"data/\" + item.name)\n", "\n", "def parallel_download(session, results, max_workers=10):\n", " with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n", " # Wrap the results with tqdm for progress tracking\n", " list(tqdm(executor.map(lambda item: download_item(session, item), results), total=len(results), desc=\"Downloading\"))\n", "\n", "# Example usage:\n", "session = interactive_auth()\n", "\n", "# Searching for files in the iRODS collection\n", "results = search_data(session, path_pattern=\"%project/%benchmark%/%_asv.tsv\")\n", "results += search_data(session, path_pattern=\"%project/%benchmark%/%_seq.tsv\")\n", "results += search_data(session, path_pattern=\"%project/%benchmark%/%_met.tsv\")\n", "results += search_data(session,path_pattern=\"%project/%benchmark%/%_tax.tsv\")\n", "# Start the download process\n", "parallel_download(session, results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combining the ASV, SEQ, MET and TAX files" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing 197\r" ] } ], "source": [ "destinations = {\n", " \"seq.tsv\" : open(\"seq.tsv\", \"w\"), \n", " \"asv.tsv\" : open(\"asv.tsv\", \"w\"),\n", " \"tax.tsv\" : open(\"tax.tsv\", \"w\"),\n", " \"met.tsv\" : open(\"met.tsv\", \"w\")\n", "}\n", "\n", "# Merge all tsv files into the appropiate file object\n", "for index, data_file in enumerate(data_files):\n", " print(\"Processing \", index, end=\"\\r\")\n", " id = data_file.split(\"_\")[-1]\n", " content = open(\"./\" + data_file).read()\n", " print(content, file=destinations[id])\n", "\n", "# Close all destination files\n", "for destination in destinations:\n", " 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cleaning the taxonomy table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " taxonomy\n",
      "ID \n",
      "0253409fd06cacc6d77fbea3daf7904b4a849884ac933ca... k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...\n",
      "02a5d30774a4aa2bb68b37ad053084e4a8756498945c258... k__Bacteria; p__Thermodesulfobacteriota; c__De...\n",
      "032dc4f2f5b3511282069f4220eae7e27135aee75489b41... k__Bacteria; p__Bacillota; c__Bacilli; o__Lact...\n",
      "0612b5106089c3769793ee4294d675a3f6a82c090708710... k__Bacteria; p__Bacillota; c__Bacilli; o__Erys...\n",
      "06397b16f8c1c32a87d8aa93c0447deccbd6cdc06a9868e... k__Bacteria; p__Bacillota; c__Clostridia; o__L...\n"
     ]
    }
   ],
   "source": [
    "# Process taxonomy: build a greengenes-style lineage string per ASV\n",
    "import pandas as pd\n",
    "\n",
    "ranks = [\"k\", \"p\", \"c\", \"o\", \"f\", \"g\", \"s\"]\n",
    "lines = []\n",
    "for line in open(\"tax.tsv\"):\n",
    "    line = line.strip().split(\"\\\\t\")\n",
    "    # Skip blank lines introduced by the merge step\n",
    "    if not line[0]:\n",
    "        continue\n",
    "    # Pad incomplete records with NA so every record has 8 fields\n",
    "    if len(line) < 8:\n",
    "        line = line + [\"NA\"] * (8 - len(line))\n",
    "    taxonomy = \"; \".join(rank + \"__\" + name for rank, name in zip(ranks, line[1:8]))\n",
    "    lines.append([line[0], taxonomy])\n",
    "\n",
    "tax_df = pd.DataFrame(lines, columns=[\"ID\", \"taxonomy\"]).set_index(\"ID\")\n",
    "\n",
    "print(tax_df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transforming the files into matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50000 57861\r"
     ]
    }
   ],
   "source": [
    "# Load and transform the ASV file into an ASV x sample matrix\n",
    "lines = []\n",
    "size = {\"nRow\": set(), \"nCol\": set()}\n",
    "# Read the asv.tsv file (three columns: sample, ASV, count)\n",
    "for line in open(\"asv.tsv\"):\n",
    "    line = line.strip().split(\"\\\\t\")\n",
    "    # Skip lines that do not have exactly three fields\n",
    "    if len(line) != 3: continue\n",
    "    # Add to lines\n",
    "    lines.append(line)\n",
    "    # Rows are ASV identifiers\n",
    "    size[\"nRow\"].add(line[1])\n",
    "    # Columns are sample identifiers\n",
    "    size[\"nCol\"].add(line[0])\n",
    "\n",
    "# Build an empty ASV x sample matrix with sorted row and column labels\n",
    "asv_df = pd.DataFrame(index=sorted(size[\"nRow\"]), columns=sorted(size[\"nCol\"]))\n",
    "\n",
    "# Fill the matrix with the values from the lines\n",
    "for index, line in enumerate(lines):\n",
    "    if index % 10000 == 0:\n",
    "        print(index, len(lines), end=\"\\\\r\")\n",
    "    # Clean the sample, ASV and value fields\n",
    "    sample, asv, value = (field.strip() for field in line)\n",
    "    # Set the value in the matrix using the ASV and sample as labels\n",
    "    asv_df.loc[asv, sample] = value"
   ]
  },
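  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Filling a DataFrame cell by cell with `.loc` is slow for large tables. As an alternative, the same matrix can be built in one step with a long-to-wide pivot. The cell below is an illustrative sketch, assuming `asv.tsv` holds three tab-separated columns (sample, ASV, count) as above; it is not required for the rest of the pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative sketch: build the ASV x sample matrix with a pivot instead of\n",
    "# a cell-by-cell .loc loop. Assumes three tab-separated columns per line\n",
    "# (sample, ASV, count); malformed lines are dropped.\n",
    "import pandas as pd\n",
    "\n",
    "asv_long = pd.read_csv(\"asv.tsv\", sep=\"\\\\t\", header=None,\n",
    "                       names=[\"sample\", \"asv\", \"count\"],\n",
    "                       on_bad_lines=\"skip\").dropna()\n",
    "asv_wide = asv_long.pivot_table(index=\"asv\", columns=\"sample\",\n",
    "                                values=\"count\", aggfunc=\"first\")\n",
    "print(asv_wide.shape)"
   ]
  },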
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Combining the matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the ASV and taxonomy dataframes on their ASV identifiers\n",
    "result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)\n",
    "# Remove duplicate rows\n",
    "result = result.drop_duplicates(keep=\"first\")\n",
    "# Fill NaN with 0; infer_objects avoids the pandas downcasting FutureWarning\n",
    "result = result.fillna(0).infer_objects(copy=False)\n",
    "\n",
    "# Replace the row names with ASV_1, ASV_2, ...\n",
    "result.index = [\"ASV_\" + str(index + 1) for index in range(len(result))]\n",
    "\n",
    "# Write to file\n",
    "result.to_csv(\"merged.tsv\", sep=\"\\\\t\")\n",
    "\n",
    "# Reload the file to fix the header lines for biom conversion\n",
    "lines = open(\"merged.tsv\").readlines()\n",
    "# Add a comment header\n",
    "lines.insert(0, \"# Automatically generated input file for biom conversion\")\n",
    "# Prefix the column header with the mandatory '#OTU ID' label\n",
    "lines[1] = \"#OTU ID\" + lines[1]\n",
    "# Write back to file, closing the handle so all data is flushed to disk\n",
    "with open(\"merged.tsv\", \"w\") as output:\n",
    "    for line in lines:\n",
    "        print(line.strip(), file=output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Converting the merged table into a BIOM file with sample metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing biom file to merged.biom\n"
     ]
    }
   ],
   "source": [
    "# Load biom and json\n",
    "from biom import load_table\n",
    "import json\n",
    "\n",
    "# Convert merged.tsv to a biom table\n",
    "biom_content = load_table(\"merged.tsv\")\n",
    "\n",
    "# Collect the sample identifiers present in the biom table\n",
    "sample_ids = set(biom_content.ids(axis=\"sample\"))\n",
    "metadata = {}\n",
    "keys = set()\n",
    "\n",
    "# Read the met.tsv file (three columns: identifier, key, value)\n",
    "for line in open(\"met.tsv\"):\n",
    "    line = line.strip()\n",
    "    # Skip lines without tab-separated fields\n",
    "    if \"\\\\t\" not in line: continue\n",
    "    identifier, key, value = line.split(\"\\\\t\")\n",
    "    # The biom reader expects every sample to carry the same metadata keys\n",
    "    keys.add(key)\n",
    "    # Skip samples that are not in this biom object\n",
    "    if identifier not in sample_ids: continue\n",
    "    if identifier not in metadata:\n",
    "        metadata[identifier] = {}\n",
    "    metadata[identifier][key] = str(value)\n",
    "\n",
    "# Add missing keys to each sample's metadata using a sorted key list\n",
    "keys = sorted(keys)\n",
    "for identifier in metadata:\n",
    "    for key in keys:\n",
    "        if key not in metadata[identifier]:\n",
    "            metadata[identifier][key] = \"None\"\n",
    "\n",
    "# Add the sample metadata\n",
    "biom_content.add_metadata(metadata)\n",
    "# Set the biom table type\n",
    "biom_content.type = \"OTU table\"\n",
    "json_data = biom_content.to_json(generated_by=\"UNLOCK conversion module\")\n",
    "\n",
    "# Create a Python object from the JSON string\n",
    "obj = json.loads(json_data)\n",
    "\n",
    "# Split each taxonomy string into a list with one entry per rank\n",
    "for row in obj[\"rows\"]:\n",
    "    row[\"metadata\"][\"taxonomy\"] = row[\"metadata\"][\"taxonomy\"].split(\"; \")\n",
    "\n",
    "# Pretty-print the JSON\n",
    "json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)\n",
    "\n",
    "# Write the biom file\n",
    "biom_file = \"merged.biom\"\n",
    "print(\"Writing biom file to\", biom_file)\n",
    "with open(biom_file, \"w\") as handle:\n",
    "    print(json_formatted_str, file=handle)"
   ]
  },
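  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To verify the result, the generated BIOM file can be loaded back and inspected. A minimal check, assuming `merged.biom` was written as above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: reload the generated BIOM file and inspect its dimensions.\n",
    "from biom import load_table\n",
    "\n",
    "table = load_table(\"merged.biom\")\n",
    "# shape is (number of observations, number of samples)\n",
    "print(\"Observations x samples:\", table.shape)\n",
    "print(\"First observation IDs:\", table.ids(axis=\"observation\")[:3])\n",
    "print(\"First sample IDs:\", table.ids(axis=\"sample\")[:3])"
   ]
  }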
"version": "3.12.8" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }