Source code for edgePy.data_import.mongodb.gene_functions

"""The core Python code for generating data."""
from typing import Dict, Optional, List, Tuple, Any, Hashable


def get_genelist_from_file(filename: str) -> Optional[List[str]]:
    """
    Convert a gene list file into a list of genes.

    The file is read as text, one gene per line. Surrounding whitespace is
    stripped from every line and lines that are empty after stripping are
    skipped, so blank lines never show up as empty gene names. A file whose
    name ends in ".gz" is read as gzip-compressed text.

    Args:
        filename: gene list file name.

    Returns:
        the list of genes found in the file, or None when no filename is given.
    """
    if not filename:
        return None

    if str(filename).endswith(".gz"):
        # Local import: keeps this function self-contained without touching
        # the module-level import block.
        import gzip

        opener = gzip.open
    else:
        opener = open

    # "rt" is plain text mode for open() and forces text mode for gzip.open().
    with opener(filename, "rt") as file_handle:
        stripped_lines = (line.strip() for line in file_handle)
        return [gene for gene in stripped_lines if gene]
def translate_genes(
    genes: Optional[List[str]], mongo_reader: Any, database: str = "ensembl_90_37"
) -> Tuple[List[str], Dict[str, str]]:
    """
    Translate a list of genes into ENSG ids and an ENSG-to-symbol lookup.

    Entries starting with "ENSG" are looked up in the "symbol_by_ensg"
    collection; all other entries are treated as symbols and looked up in the
    "ensg_by_symbol" collection. When no genes are given, both collections
    are queried without a filter.

    Args:
        genes: list of genes to filter on.
        mongo_reader: the mongo connector
        database: the name of the database to use. "pytest" for unit testing (mocking)

    Returns:
        a list of ENSG ids (the ones passed in, followed by those found for the
        requested symbols), and a dictionary mapping ENSG ids to gene symbols.
    """
    ensg_genes: List[str] = []
    non_ensg_genes: List[str] = []
    gene_symbols: Dict[str, str] = {}
    unfiltered = not genes

    # Sort every requested gene into the ENSG or the symbol bucket.
    for gene in genes or []:
        bucket = ensg_genes if gene.startswith("ENSG") else non_ensg_genes
        bucket.append(gene)

    if ensg_genes or unfiltered:
        ensg_query: Dict[Hashable, Any] = {} if unfiltered else {"_id": {"$in": ensg_genes}}
        records = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query=ensg_query)
        for record in records:
            # Each assignment overwrites the previous one, so the last symbol
            # listed for an ENSG id is the one that is kept.
            for name in record["symbols"]:
                gene_symbols[record["_id"]] = name

        # Requested ENSG ids without a known symbol map to themselves.
        for ensg_id in ensg_genes:
            gene_symbols.setdefault(ensg_id, ensg_id)

    if non_ensg_genes or unfiltered:
        symbol_query: Dict[Hashable, Any] = (
            {} if unfiltered else {"_id": {"$in": non_ensg_genes}}
        )
        records = mongo_reader.find_as_cursor(database, "ensg_by_symbol", query=symbol_query)
        for record in records:
            symbol = record["_id"]
            for ensg_id in record["ensgs"]:
                gene_symbols[ensg_id] = symbol
                ensg_genes.append(ensg_id)

    return ensg_genes, gene_symbols
def get_gene_list(mongo_reader: Any, database: str = "ensembl_90_37") -> Dict[str, str]:
    """
    Build an ENSG-id-to-symbol lookup from the "symbol_by_ensg" collection.

    Every document in the collection is read. When a document lists several
    symbols, the last one is kept; a document with no symbols adds no entry.

    Args:
        mongo_reader: the mongo wrapper
        database: database name to use.

    Returns:
        a dictionary mapping each ENSG id to a gene symbol.
    """
    records = mongo_reader.find_as_cursor(database, "symbol_by_ensg", query={})
    # Later symbols overwrite earlier ones for the same id, as repeated
    # assignment in a loop would.
    return {record["_id"]: name for record in records for name in record["symbols"]}
def get_sample_details(
    group_by: str, mongo_reader: Any, database: str
) -> Dict[Any, Dict[str, Any]]:
    """
    Collect per-sample details from the "samples" collection.

    Only samples that have the `group_by` key are fetched. Use the result to
    decide which samples to query data for.

    Args:
        group_by: the name of the key to group samples by (Category-based key)
        mongo_reader: the mongo wrapper
        database: the database to use

    Returns:
        a dictionary keyed by sample name; each value holds the sample's
        "description" (falling back to the sample name when the sample has no
        "Description" field) and its "category" (the value stored under
        `group_by`).
    """
    wanted_fields = {"_id": 0, group_by: 1, "sample_name": 1, "Description": 1}
    cursor = mongo_reader.find_as_cursor(
        database,
        "samples",
        query={group_by: {"$exists": True}},
        projection=wanted_fields,
    )

    sample_details = {}
    for record in cursor:
        if "Description" in record:
            description = record["Description"]
        else:
            description = record["sample_name"]
        entry = {"description": description, "category": record[group_by]}
        sample_details[record["sample_name"]] = entry
    return sample_details
def get_canonical_rpkm(result: Dict[str, Any]) -> Optional[int]:
    """
    Return the rpkm of the canonical transcript of a data collection entry.

    Args:
        result: the entry in the data collection; its "transcripts" mapping is
            scanned for the first transcript whose "canonical" flag equals 1.

    Returns:
        the rpkm value of that transcript, or None if no transcript is canonical.
    """
    canonical_rpkms = (
        transcript["rpkm"]
        for transcript in result["transcripts"].values()
        if int(transcript["canonical"]) == 1
    )
    # The generator is lazy, so scanning stops at the first canonical transcript.
    return next(canonical_rpkms, None)
def get_canonical_raw(result: Dict[str, Any]) -> Optional[int]:
    """
    Approximate the raw read count of a data collection entry.

    The count is the sum of the "raw" values of all exons of the first
    transcript whose "canonical" flag equals 1.

    Args:
        result: the entry from the data collection

    Returns:
        the raw count (as an integer), or None if no transcript is canonical.
    """
    for transcript in result["transcripts"].values():
        if int(transcript["canonical"]) != 1:
            continue
        exons = transcript["exons"]
        return sum(int(exons[exon_id]["raw"]) for exon_id in exons)
    return None