Source code for astir.data.data_readers

import os
import warnings
from typing import Any

import anndata
import loompy
import matplotlib.cbook
import numpy as np
import pandas as pd
import torch
import yaml
from sklearn.preprocessing import OneHotEncoder

# Silence noisy warnings emitted by third-party dependencies at import/use time.
warnings.simplefilter(action="ignore", category=FutureWarning)

# ``matplotlib.cbook.mplDeprecation`` was deprecated in matplotlib 3.6 and
# removed in 3.8, so referencing it unconditionally makes this module fail to
# import on current matplotlib releases. Prefer the top-level warning class
# (the legacy name was only an alias of it) and fall back to the old alias
# solely when the top-level name is unavailable.
_mpl_deprecation_warning = getattr(matplotlib, "MatplotlibDeprecationWarning", None)
if _mpl_deprecation_warning is None:
    _mpl_deprecation_warning = getattr(matplotlib.cbook, "mplDeprecation", None)
if _mpl_deprecation_warning is not None:
    warnings.filterwarnings("ignore", category=_mpl_deprecation_warning)


def from_csv_yaml(
    csv_input: str,
    marker_yaml: str,
    design_csv: str = None,
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Build an Astir object from an expression CSV and a marker YAML file.

    :param csv_input: Path to the input CSV holding expression values, with
        cells as rows and proteins as columns. The first column is the cell
        identifier; the remaining column names are gene identifiers.
    :param marker_yaml: Path to the YAML file describing the marker genes.
        Should include cell_type and cell_state entries. See documentation.
    :param design_csv: Path to a design matrix stored as CSV, with cells as
        rows and covariates as columns. The first column is the cell
        identifier; the remaining column names are covariate identifiers.
    :param create_design_mat: Whether a design matrix is created. Defaults
        to True. The design CSV is only read when this is True and
        ``design_csv`` is given.
    :param random_seed: The random seed used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: the ``Astir`` object constructed from the loaded inputs
    """
    # First column of the expression CSV is used as the row index.
    expression = pd.read_csv(csv_input, index_col=0)

    if design_csv is not None and create_design_mat == True:
        covariates = pd.read_csv(design_csv, index_col=0)
    else:
        covariates = None

    with open(marker_yaml, "r") as yaml_handle:
        markers = yaml.safe_load(yaml_handle)

    # Imported at call time rather than at module import.
    from astir.astir import Astir

    return Astir(expression, markers, covariates, random_seed, dtype=dtype)
def from_csv_dir_yaml(
    input_dir: str,
    marker_yaml: str,
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from a directory containing multiple CSV files.

    Every ``*.csv`` file in ``input_dir`` is read (first column as index) and
    the resulting tables are concatenated row-wise, in sorted file-name order.
    When ``create_design_mat`` is true, a sample-specific design matrix is
    built: one intercept column of ones followed by one-hot indicators of the
    originating file, with the final indicator column dropped.

    :param input_dir: Path to a directory containing multiple CSV files, each
        in the format expected by `from_csv_yaml`
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param create_design_mat: Determines whether a design matrix is created.
        Defaults to True.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :raises ValueError: if ``input_dir`` contains no ``.csv`` files
    :return: the ``Astir`` object constructed from the concatenated data
    """
    # ``os.listdir`` returns entries in arbitrary order; sort so that the row
    # order of the expression table and the per-file indicator columns are
    # reproducible across runs and machines. Match on the ".csv" extension
    # (not merely a trailing "csv") so unrelated files are not parsed.
    csv_files = sorted(
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.endswith(".csv")
    )
    if not csv_files:
        # Fail early with an actionable message instead of the opaque
        # "No objects to concatenate" raised by ``pd.concat``.
        raise ValueError(f"No CSV files found in directory: {input_dir}")

    # Read each file into a gene expression frame and stack them row-wise.
    dfs = [pd.read_csv(f, index_col=0) for f in csv_files]
    df_gex = pd.concat(dfs, axis=0)

    design = None
    if create_design_mat:
        # Label every row with the index of the file it came from.
        sample_labels = np.concatenate(
            [np.repeat(str(i), df.shape[0]) for i, df in enumerate(dfs)],
            axis=0,
        ).reshape(-1, 1)
        design = OneHotEncoder().fit_transform(sample_labels).todense()
        design = design[:, :-1]  # remove final column
        design = np.concatenate(
            [np.ones((design.shape[0], 1)), design], axis=1
        )  # add in intercept

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    # Imported at call time rather than at module import.
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
def from_loompy_yaml(
    loom_file: str,
    marker_yaml: str,
    protein_name_attr: str = "protein",
    cell_name_attr: str = "cell_name",
    batch_name_attr: str = "batch",
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Build an Astir object from a loom file and a marker YAML file.

    :param loom_file: Path to a loom file, where rows correspond to proteins
        and columns to cells
    :param marker_yaml: Path to the YAML file describing the marker genes.
        Should include cell_type and cell_state entries. See documentation.
    :param protein_name_attr: The row-attribute key identifying the protein
        names (required to match with the marker gene information), defaults
        to protein
    :param cell_name_attr: The column-attribute key identifying the name of
        each cell, defaults to cell_name. Used as the row index when present.
    :param batch_name_attr: The column-attribute key identifying the batch.
        When present, a one-hot encoded design matrix is built from it to
        control for batch, defaults to batch
    :param create_design_mat: Whether a design matrix is created. Defaults
        to True.
    :param random_seed: The random seed used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An ``Astir`` object using data imported from the loom file
    """
    # TODO: This function is memory inefficient and goes against the
    # philosophy of loom files. Should be improved
    batches = None
    with loompy.connect(loom_file) as loom:
        # The full matrix is loaded and transposed to cells x proteins.
        expression = pd.DataFrame(loom[:, :].T)
        expression.columns = loom.ra[protein_name_attr]

        if cell_name_attr in loom.ca.keys():
            expression.index = loom.ca[cell_name_attr]

        if batch_name_attr in loom.ca.keys():
            batches = loom.ca[batch_name_attr]

    covariates = None
    if batches is not None and create_design_mat == True:
        batch_column = batches.reshape(-1, 1)
        one_hot = OneHotEncoder().fit_transform(batch_column).todense()
        reduced = one_hot[:, :-1]  # remove final column
        intercept = np.ones((reduced.shape[0], 1))
        covariates = np.concatenate([intercept, reduced], axis=1)

    with open(marker_yaml, "r") as yaml_handle:
        markers = yaml.safe_load(yaml_handle)

    # Imported at call time rather than at module import.
    from astir.astir import Astir

    return Astir(expression, markers, covariates, random_seed, dtype)
def from_anndata_yaml(
    anndata_file: str,
    marker_yaml: str,
    protein_name: str = None,
    cell_name: str = None,
    batch_name: str = "batch",
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from an :class:`anndata.AnnData` file and a marker yaml

    :param anndata_file: Path to an :class:`anndata.AnnData` ``h5ad`` file
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param protein_name: The column of `adata.var` containing protein names.
        If this is none, defaults to `adata.var_names`
    :param cell_name: The column of `adata.obs` containing cell names. If
        this is none, defaults to `adata.obs_names`
    :param batch_name: The column of `adata.obs` containing batch names. A
        design matrix will be built using this (if present) using a one-hot
        encoding to control for batch, defaults to 'batch'
    :param create_design_mat: Determines whether a design matrix is created.
        Defaults to True.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An ``Astir`` object using data imported from the AnnData file
    """
    # TODO: This function is memory inefficient (the whole expression matrix
    # is materialised as a dense DataFrame). Should be improved
    ad = anndata.read_h5ad(anndata_file)

    # ``ad.X`` may be a scipy sparse matrix, which ``pd.DataFrame`` cannot
    # turn into a cells x proteins table; densify it first.
    expression = ad.X
    if hasattr(expression, "toarray"):
        expression = expression.toarray()
    df_gex = pd.DataFrame(expression)

    if protein_name is not None:
        df_gex.columns = ad.var[protein_name]
    else:
        df_gex.columns = ad.var_names

    if cell_name is not None:
        df_gex.index = ad.obs[cell_name]
    else:
        df_gex.index = ad.obs_names

    # Only use the batch column when it actually exists: the default
    # ``batch_name="batch"`` must not raise KeyError for files without it.
    batch_list = None
    if batch_name is not None and batch_name in ad.obs.columns:
        batch_list = ad.obs[batch_name]

    design = None
    if batch_list is not None and create_design_mat:
        design = (
            OneHotEncoder()
            .fit_transform(batch_list.to_numpy().reshape(-1, 1))
            .todense()
        )
        design = design[:, :-1]  # remove final column
        # Prepend an intercept column of ones.
        design = np.concatenate([np.ones((design.shape[0], 1)), design], axis=1)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    # Imported at call time rather than at module import.
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)