Source code for rnanorm.datasets
"""Datasets."""
from pathlib import Path
import pandas as pd
from sklearn.utils import Bunch
FILES_DIR = Path(__file__).parent / "files"
[docs]
def load_toy_data() -> Bunch:
    """
    Return the bundled toy RNA-seq raw count dataset.

    The data is made up and does not resemble any real RNA-seq experiment.
    It is deliberately tiny: complex enough to show what each normalization
    method does, yet simple enough that results can be verified by hand.

    The returned ``Bunch`` has two entries:

    - ``exp``: expression table read from ``toy_exp.csv`` with the first
      column used as the index,
    - ``gtf_path``: path to the accompanying ``toy.gtf`` annotation file.

    .. rubric:: Examples

    >>> from rnanorm.datasets import load_toy_data
    >>> dataset = load_toy_data()
    >>> dataset.exp
              Gene_1  Gene_2  Gene_3  Gene_4  Gene_5
    Sample_1     200     300     500    2000    7000
    Sample_2     400     600    1000    4000   14000
    Sample_3     200     300     500    2000   17000
    Sample_4     200     300     500    2000    2000
    >>> # TPM and FPKM normalization also require GTF file
    >>> dataset.gtf_path
    PosixPath('/Users/me/.../toy.gtf')

    """
    expression = pd.read_csv(FILES_DIR / "toy_exp.csv", index_col=0)
    annotation_path = FILES_DIR / "toy.gtf"
    return Bunch(exp=expression, gtf_path=annotation_path)
[docs]
def load_gtex() -> Bunch:
    """
    Return a bundled subset of a real RNA-seq dataset from the GTEx project.

    The subset is limited to chr21 and to the first 30 samples of the GTEx
    lung V8 data.

    The returned ``Bunch`` has two entries:

    - ``exp``: expression table read from
      ``gtex_lung.first30.chr21.csv.gz`` with the first column used as the
      index,
    - ``gtf_path``: path to the accompanying
      ``gencode.v26.annotation.chr21.gtf.gz`` annotation file.

    .. rubric:: Examples

    >>> from rnanorm.datasets import load_gtex
    >>> dataset = load_gtex()
    >>> dataset.exp
                              ENSG00000141956.13  ENSG00000141959.16  ...
    GTEX-111CU-0326-SM-5GZXO                 871                8129  ...
    GTEX-111FC-1126-SM-5GZWU                 852                7076  ...
    GTEX-111VG-0726-SM-5GIDC                 912               11016  ...
    ...                                      ...                 ...  ...
    >>> # TPM and FPKM normalization also require GTF file
    >>> dataset.gtf_path
    PosixPath('/Users/.../gencode.v26.annotation.chr21.gtf.gz')

    """
    expression = pd.read_csv(
        FILES_DIR / "gtex_lung.first30.chr21.csv.gz",
        index_col=0,
    )
    annotation_path = FILES_DIR / "gencode.v26.annotation.chr21.gtf.gz"
    return Bunch(exp=expression, gtf_path=annotation_path)