{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Amplicon Mimosa\n", "\n", "This document contains many functions for all kinds of amplicon analysis used at the **UNLOCK FAIR data platform**.\n", "\n", "```{note}\n", "Please note that this document is a work in progress.\n", "```\n", "\n", "## MIMOSA Data retrieval using iBridges\n", "\n", "This page assumes you have already installed iBridges and created an irods json file. Check the [iBridges python client page](../storage/ibridges_python.ipynb) for the latest information on how to install, authenticate, and retrieve data from iBridges.\n", "\n", "First, fill in your SRAM username and token:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "from ibridges import Session\n", "from getpass import getpass\n", "\n", "# Set iRODS environment directory and environment file\n", "irods_env_dir = \"~/.irods\"\n", "irods_env_file = \"irods_environment.json\"\n", "\n", "# Provide your SRAM username\n", "if not 'username' in locals():\n", " username = input(\"Enter your SRAM username: \")\n", "\n", "env_loc = Path(irods_env_dir) / irods_env_file\n", "env_file = Path.expanduser(Path(env_loc))\n", "\n", "if not 'password' in locals():\n", " password = getpass(\"Enter your SRAM password: \")\n", "\n", "try:\n", " session = Session(env_file, password=password)\n", "except Exception as e:\n", " print(\"Failed to establish session:\", e)\n", " del password" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the INVESTIGATION and STUDY variables to the corresponding names of your Investigation and Study.\n", "\n", "*This will download All NG-Tax TURTLE data from the iRODS instance* " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from ibridges import IrodsPath, search_data, download\n", "from pathlib import Path\n", "import os\n", "\n", "# Set investigation and study variables. Investigation name corresponds to the folder located at /unlock/home/wur. and study to a subfolder /stu_\n", "investigation = \"SET_INVESTIGATION\" \n", "study = \"SET_STUDY\"\n", "\n", "# Define where to download files locally\n", "download_path = \"./unlock_downloads/\"+investigation+\"/\"+study \n", "\n", "# Create the directory if it doesn't exist yet\n", "download_dir = Path.expanduser(Path(download_path))\n", "download_dir.mkdir(parents=True, exist_ok=True )\n", "\n", "# Define specified file pattern (e.g. ttl files)\n", "pattern=\"%.ttl\"\n", "# Define search path\n", "search = f\"/unlock/home/wur.{investigation}/stu_{study}/\"\n", "data = search_data(session, path=IrodsPath(session, search), path_pattern=pattern)\n", "\n", "# Set options and counters\n", "overwrite = False\n", "downloaded, skipped = 0,0\n", "unique_folders = []\n", "\n", "# Iterate over search results and download files\n", "for item in data:\n", " path = str(item.absolute()) # Convert path to string\n", " local_destination = Path(download_path) # Base local directory\n", " file_destination = local_destination / item.name # Construct local file path\n", "\n", " if item.dataobject_exists(): # Check if file\n", " local_destination.mkdir(parents=True, exist_ok=True)\n", " if not file_destination.exists() or overwrite: # Download only if not present\n", " download(session, item, file_destination, overwrite=overwrite)\n", " downloaded += 1\n", " else:\n", " skipped += 1\n", "\n", "# Print download summary\n", "print(\"\\nDownloaded: \", downloaded)\n", "print(\"Skipped: \", skipped)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def create_repository(label):\n", " import requests\n", " # RDF4J Repository config file\n", " content = \"\"\"\n", "#\n", "# RDF4J configuration template for a GraphDB repository\n", "#\n", "@prefix rdfs: .\n", "@prefix rep: .\n", "@prefix sr: .\n", "@prefix sail: .\n", "@prefix graphdb: .\n", "\n", "[] a rep:Repository ;\n", " rep:repositoryID \"XXXX\" ;\n", " rdfs:label \"\" ;\n", " rep:repositoryImpl [\n", " rep:repositoryType \"graphdb:SailRepository\" ;\n", " sr:sailImpl [\n", " sail:sailType \"graphdb:Sail\" ;\n", "\n", " graphdb:read-only \"false\" ;\n", "\n", " # Inference and Validation\n", " graphdb:ruleset \"rdfsplus-optimized\" ;\n", " graphdb:disable-sameAs \"true\" ;\n", " graphdb:check-for-inconsistencies \"false\" ;\n", "\n", " # Indexing\n", " graphdb:entity-id-size \"32\" ;\n", " graphdb:enable-context-index \"false\" ;\n", " graphdb:enablePredicateList \"true\" ;\n", " graphdb:enable-fts-index \"false\" ;\n", " graphdb:fts-indexes (\"default\" \"iri\") ;\n", " graphdb:fts-string-literals-index \"default\" ;\n", " graphdb:fts-iris-index \"none\" ;\n", "\n", " # Queries and Updates\n", " graphdb:query-timeout \"0\" ;\n", " graphdb:throw-QueryEvaluationException-on-timeout \"false\" ;\n", " graphdb:query-limit-results \"0\" ;\n", "\n", " # Settable in the file but otherwise hidden in the UI and in the RDF4J console\n", " graphdb:base-URL \"http://example.org/owlim#\" ;\n", " graphdb:defaultNS \"\" ;\n", " graphdb:imports \"\" ;\n", " graphdb:repository-type \"file-repository\" ;\n", " graphdb:storage-folder \"storage\" ;\n", " graphdb:entity-index-size \"10000000\" ;\n", " graphdb:in-memory-literal-properties \"true\" ;\n", " graphdb:enable-literal-index \"true\" ;\n", " ]\n", " ].\n", " \"\"\"\n", " content = content.replace(\"XXXX\", label)\n", " writer = open(\"config.ttl\",\"w\")\n", " print(content, file=writer)\n", " writer.close()\n", "\n", " headers = {\n", " # requests won't add a boundary if this header is set when you pass files=\n", " # 'Content-Type': 'multipart/form-data',\n", " }\n", "\n", " files = {\n", " 'config': open('./config.ttl', 'rb'),\n", " }\n", "\n", " response = requests.post('http://localhost:7200/rest/repositories', headers=headers, files=files)\n", " print(response.__dict__)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Process the turtle files\n", "from rdflib import Graph\n", "import requests, json\n", "import time \n", "\n", "# Check if GraphDB runs:...\n", "# docker volume create graphdb\n", "# docker run -v graphdb:/opt/graphdb/dist/data --name graphdb -p 7200:7200 ontotext/graphdb:10.1.3 -Xmx6g\n", "graphdb = 'http://localhost:7200'\n", "\n", "# GraphDB restrictions\n", "REPO = investigation.replace(\".\", \"_\")\n", "\n", "def check_graphdb():\n", " headers = {\n", " 'Accept': 'application/json',\n", " }\n", "\n", " response = requests.get(graphdb + '/rest/repositories', headers=headers)\n", " content = json.loads(response.content.decode())\n", " repo_available = False\n", " print(\"Available repositories:\")\n", " for repository in content:\n", " if repository['id'] == REPO:\n", " print(repository['id'])\n", " repo_available = True\n", " if not repo_available:\n", " # Create repository\n", " create_repository(REPO)\n", "\n", "def load_data(data_file):\n", " headers = {\n", " 'Content-Type': 'application/x-turtle',\n", " 'Accept': 'application/json'\n", " }\n", "\n", " with open(data_file, 'rb') as f:\n", " response = requests.post(graphdb + '/repositories/'+REPO+'/statements', data=f, headers=headers)\n", " if \"Unknown repository:\" in response.content.decode():\n", " raise Exception(response.content.decode())\n", "\n", "###\n", "to_be_loaded = set()\n", "check_graphdb()\n", "\n", "search_path = f\"./unlock_downloads/{investigation}/{study}\"\n", "for current_dir, subdirs, files in os.walk(search_path):\n", " for file in files:\n", " if (file.endswith(\".ttl\")) and investigation in current_dir:\n", " to_be_loaded.add(current_dir + \"/\" + file)\n", "\n", "if len(to_be_loaded) == 0:\n", " print(\"No files to be loaded\")\n", "\n", "for index, file_path in enumerate(to_be_loaded):\n", " current = time.time()\n", " load_data(file_path)\n", " end = time.time()\n", " diff = int(end - current)\n", " print(\"Loaded file \",index, \"of\", len(to_be_loaded),\"in\", diff, \"seconds with\",file_path.split(\"/\")[-1], \" \"*10, end=\"\\r\")\n", " " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### MIMOSA table generator" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "from SPARQLWrapper import SPARQLWrapper, JSON\n", "# Make connection with sparql endpoint containing all NG-Tax 2.0 data\n", "sparql = SPARQLWrapper(\"http://localhost:7200/repositories/\" + REPO)\n", "sparql.setReturnFormat(JSON)\n", "\n", "# Get all sample name mappings\n", "# Query for column headers to sample name\n", "query = \"\"\"\n", "PREFIX gbol: \n", "PREFIX schema: \n", "PREFIX jerm: \n", "SELECT DISTINCT ?sample_name ?sample_id WHERE { \n", "\t?ngtax_sample gbol:name ?sample_name .\n", " ?ngtax_sample gbol:metadata ?mappingFile .\n", " ?mappingFile a gbol:MappingFile .\n", " ?mappingFile gbol:fastQforward ?fwd .\n", " ?mappingFile gbol:fastQreverse ?rev .\n", " ?sample jerm:hasPart/jerm:hasPart/schema:identifier ?fwd .\n", " ?sample jerm:hasPart/jerm:hasPart/schema:identifier ?rev .\n", " ?sample a jerm:Sample .\n", " ?sample schema:identifier ?sample_id .\n", "}\"\"\"\n", "\n", "sparql.setQuery(query)\n", "ret = sparql.queryAndConvert()\n", "# Iterate over the sample names\n", "sample_name_lookup = {}\n", "for index, r in enumerate(ret[\"results\"][\"bindings\"]):\n", " sample_name_lookup[r['sample_name']['value']] = r['sample_id']['value']\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from SPARQLWrapper import SPARQLWrapper, JSON\n", "# Make connection with sparql endpoint containing all NG-Tax 2.0 data\n", "sparql = SPARQLWrapper(\"http://localhost:7200/repositories/\" + REPO)\n", "sparql.setReturnFormat(JSON)\n", "\n", "# Obtain all samples to make it more scalable\n", "query = \"\"\"\n", "PREFIX gbol:\n", "SELECT DISTINCT ?sample\n", "WHERE {\n", " ?sample a gbol:Sample .\n", "}\n", "\"\"\"\n", "\n", "sparql.setQuery(query)\n", "ret = sparql.queryAndConvert()\n", "# Create containers for the forward, reverse ASV and the sample list\n", "container_fseq = {}\n", "container_rseq = {}\n", "all_samples = set()\n", "# Iterate over the samples\n", "for index, r in enumerate(ret[\"results\"][\"bindings\"]):\n", " # For each sample get all ASVs and abundance information\n", " sample = r['sample']['value']\n", " print(\"Processed sample number\", index, end='\\r')\n", " query = \"\"\"\n", " PREFIX gbol:\n", " SELECT DISTINCT ?name ?fseq ?rseq ?count\n", " WHERE {\n", " VALUES ?sample { <\"\"\"+sample+\"\"\"> }\n", " ?sample a gbol:Sample .\n", " ?sample gbol:name ?name .\n", " ?sample gbol:asv ?asv .\n", " ?asv a gbol:ASVSet . \n", " ?asv gbol:forwardASV ?fasv .\n", " ?fasv gbol:sequence ?fseq .\n", " ?asv gbol:reverseASV ?rasv . \n", " ?rasv gbol:sequence ?rseq .\n", " ?asv gbol:clusteredReadCount ?count .\n", " }\"\"\"\n", " # Set query and iterate over results\n", " sparql.setQuery(query)\n", " ret = sparql.queryAndConvert()\n", " for r in ret[\"results\"][\"bindings\"]:\n", " raw_name = r['name']['value']\n", " # look up, but if missing just use the raw name\n", " name = sample_name_lookup.get(raw_name, raw_name)\n", " fseq = r['fseq']['value']\n", " rseq = r['rseq']['value']\n", " count = int(r['count']['value'])\n", " all_samples.add(name)\n", " # Fseq object fill method\n", " if fseq not in container_fseq:\n", " container_fseq[fseq] = {}\n", " if name not in container_fseq[fseq]:\n", " container_fseq[fseq][name] = 0\n", " container_fseq[fseq][name] += count\n", "\n", " # Rseq object fill method\n", " if rseq not in container_rseq:\n", " container_rseq[rseq] = {}\n", " if name not in container_rseq[rseq]:\n", " container_rseq[rseq][name] = 0\n", " container_rseq[rseq][name] += count" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "# Create an abundance matrix using numpy\n", "def create_abundance_matrix(name, container, all_samples):\n", " matrix = np.zeros((len(container), len(all_samples)), dtype=int)\n", " all_samples = sorted(list(all_samples))\n", " asvs = sorted(container.keys())\n", " for row_index, asv in enumerate(asvs):\n", " for sample in container[asv]:\n", " coll_index = all_samples.index(sample)\n", " value = container[asv][sample]\n", " matrix[row_index,coll_index] = int(value)\n", "\n", " # Write to file as panda frame with column and row names\n", " df = pd.DataFrame(matrix, columns=all_samples, index=asvs)\n", " df.index.name=\"OTU\"\n", " df.to_csv(name, sep=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# Creating the RSEQ tsv file\n", "create_abundance_matrix(\"fseq.tsv\", container_fseq, all_samples)\n", "# Creating the FSEQ tsv file\n", "create_abundance_matrix(\"rseq.tsv\", container_rseq, all_samples)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# First three lines of the file\n", "with open(\"fseq.tsv\") as f:\n", " print(f.readline(), end=\"\")\n", " print(f.readline(), end=\"\")\n", " print(f.readline(), end=\"\")" ] } ], "metadata": { "kernelspec": { "display_name": "Testing_nbs", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }