Source code for rnanorm.datasets

"""Datasets."""

from pathlib import Path

import pandas as pd
from sklearn.utils import Bunch

FILES_DIR = Path(__file__).parent / "files"



[docs]
def load_toy_data() -> Bunch:
    """
    Return the small synthetic raw-count dataset bundled with the package.

    The counts are invented and do not mimic any real RNA-seq experiment.
    The table is deliberately tiny, yet varied enough to show what each
    method does, and every result can be verified with pen and paper.

    :return: Bunch with two entries: ``exp``, the count table read from
        ``toy_exp.csv`` (samples in rows, genes in columns), and
        ``gtf_path``, the path of the accompanying ``toy.gtf`` annotation.

    .. rubric:: Examples

    >>> from rnanorm.datasets import load_toy_data
    >>> dataset = load_toy_data()
    >>> dataset.exp
              Gene_1  Gene_2  Gene_3  Gene_4  Gene_5
    Sample_1     200     300     500    2000    7000
    Sample_2     400     600    1000    4000   14000
    Sample_3     200     300     500    2000   17000
    Sample_4     200     300     500    2000    2000
    >>> # TPM and FPKM normalization also require GTF file
    >>> dataset.gtf_path
    PosixPath('/Users/me/.../toy.gtf')

    """
    # The first CSV column carries the sample names, so it becomes the index.
    counts = pd.read_csv(FILES_DIR / "toy_exp.csv", index_col=0)

    return Bunch(exp=counts, gtf_path=FILES_DIR / "toy.gtf")




[docs]
def load_gtex() -> Bunch:
    """
    Return a subset of real RNA-seq counts from the GTEx project.

    The subset is restricted to chr21 and to the first 30 samples of the
    GTEx lung V8 data.

    :return: Bunch with two entries: ``exp``, the count table read from
        ``gtex_lung.first30.chr21.csv.gz`` (samples in rows, genes in
        columns), and ``gtf_path``, the path of the matching
        ``gencode.v26.annotation.chr21.gtf.gz`` annotation.

    .. rubric:: Examples

    >>> from rnanorm.datasets import load_gtex
    >>> dataset = load_gtex()
    >>> dataset.exp
                              ENSG00000141956.13  ENSG00000141959.16   ...
    GTEX-111CU-0326-SM-5GZXO                 871                8129   ...
    GTEX-111FC-1126-SM-5GZWU                 852                7076   ...
    GTEX-111VG-0726-SM-5GIDC                 912               11016   ...
    ...                                      ...                 ...   ...
    >>> # TPM and FPKM normalization also require GTF file
    >>> dataset.gtf_path
    PosixPath('/Users/.../gencode.v26.annotation.chr21.gtf.gz')

    """
    counts_file = FILES_DIR / "gtex_lung.first30.chr21.csv.gz"
    annotation_file = FILES_DIR / "gencode.v26.annotation.chr21.gtf.gz"

    # The first CSV column carries the sample IDs, so it becomes the index.
    return Bunch(
        exp=pd.read_csv(counts_file, index_col=0),
        gtf_path=annotation_file,
    )
Source code for rnanorm.datasets

RNA-seq normalization

Navigation

Related Topics