Source code for cmapPy.pandasGEXpress.GCToo

"""
DATA:
-----------------------------
|  |          cid           |
-----------------------------
|  |                        |
|r |                        |
|i |          data          |
|d |                        |
|  |                        |
-----------------------------
ROW METADATA:
--------------------------
|id|        rhd          |
--------------------------
|  |                     |
|r |                     |
|i |    row_metadata     |
|d |                     |
|  |                     |
--------------------------
COLUMN METADATA:
N.B. The df is transposed from how it looks in a gct file.
---------------------
|id|      chd       |
---------------------
|  |                |
|  |                |
|  |                |
|c |                |
|i |  col_metadata  |
|d |                |
|  |                |
|  |                |
|  |                |
---------------------

N.B. rids, cids, rhds, and chds must be:
- unique
- matching in both content & order everywhere they're found
"""
import numpy as np
import pandas as pd
import logging
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger


# Module authorship metadata: author names and a contact e-mail address.
__authors__ = 'Oana Enache, Lev Litichevskiy, Dave Lahr'
__email__ = 'dlahr@broadinstitute.org'


class GCToo(object):
    """Class representing parsed gct(x) objects as pandas dataframes.

    Contains 3 component dataframes (row_metadata_df, column_metadata_df,
    and data_df) as well as an assembly of these 3 into a multi index df
    that provides an alternate way of selecting data.

    Attribute assignment is intercepted by ``__setattr__`` so that the
    metadata dataframes always agree (in content and order) with data_df.
    """

    def __init__(self, data_df, row_metadata_df=None, col_metadata_df=None,
                 src=None, version=None, make_multiindex=False,
                 logger_name=setup_logger.LOGGER_NAME):
        """Build a GCToo from a data matrix and optional metadata.

        Args:
            data_df: pandas DataFrame whose index holds the row ids and
                whose columns hold the column ids.
            row_metadata_df: pandas DataFrame indexed by row id, or None to
                use an empty dataframe indexed like data_df.index.
            col_metadata_df: pandas DataFrame indexed by column id, or None
                to use an empty dataframe indexed like data_df.columns.
            src: stored as-is on self.src.
            version: stored as-is on self.version.
            make_multiindex: if True, assemble self.multi_index_df;
                otherwise self.multi_index_df is set to None.
            logger_name: name of the logger used by this instance.

        Raises:
            Exception: if a dataframe fails check_df or id_match_check.
        """
        self.logger = logging.getLogger(logger_name)

        self.src = src
        self.version = version

        # Check data_df before setting
        self.check_df(data_df)
        self.data_df = data_df

        if row_metadata_df is None:
            self.row_metadata_df = pd.DataFrame(index=data_df.index)
        else:
            # Lots of checks will occur when this attribute is set
            # (see __setattr__ below)
            self.row_metadata_df = row_metadata_df

        if col_metadata_df is None:
            self.col_metadata_df = pd.DataFrame(index=data_df.columns)
        else:
            # Lots of checks will occur when this attribute is set
            # (see __setattr__ below)
            self.col_metadata_df = col_metadata_df

        # Create multi_index_df if explicitly requested
        if make_multiindex:
            self.assemble_multi_index_df()
        else:
            self.multi_index_df = None

        # This GCToo object is now initialized; from here on __setattr__
        # applies its post-initialization rules.
        self._initialized = True

    def __setattr__(self, name, value):
        """Set an attribute, keeping metadata consistent with data_df.

        - row_metadata_df / col_metadata_df: validated with check_df and
          id_match_check, then reindexed to the order of data_df.
        - data_df (after initialization): ids are matched against the
          current metadata, and both metadata dfs are reindexed to it.
        - multi_index_df (after initialization): reassignment is refused.
        - anything else: plain attribute assignment.

        Raises:
            Exception: if validation fails or multi_index_df is reassigned
                after initialization.
        """
        # Need to check if _initialized is present before checking if it's
        # true, or code will break during __init__.
        initialized = "_initialized" in self.__dict__ and self._initialized

        # Make sure row/col metadata agree with data_df before setting
        if name in ["row_metadata_df", "col_metadata_df"]:
            self.check_df(value)
            if name == "row_metadata_df":
                self.id_match_check(self.data_df, value, "row")
                value = value.reindex(self.data_df.index)
            else:
                self.id_match_check(self.data_df, value, "col")
                value = value.reindex(self.data_df.columns)
            super(GCToo, self).__setattr__(name, value)

        # When reassigning data_df after initialization, reindex row/col
        # metadata if necessary
        elif name == "data_df" and initialized:
            self.id_match_check(value, self.row_metadata_df, "row")
            self.id_match_check(value, self.col_metadata_df, "col")
            # Bypass the checks above: the ids were just verified.
            super(GCToo, self).__setattr__(
                "row_metadata_df", self.row_metadata_df.reindex(value.index))
            super(GCToo, self).__setattr__(
                "col_metadata_df", self.col_metadata_df.reindex(value.columns))
            super(GCToo, self).__setattr__(name, value)

        # Can't reassign multi_index_df after initialization
        elif name == "multi_index_df" and initialized:
            msg = ("Cannot reassign value of multi_index_df attribute; " +
                   "if you'd like a new multiindex df, please create a new GCToo instance " +
                   "with appropriate data_df, row_metadata_df, and col_metadata_df fields.")
            self.logger.error(msg)
            raise Exception("GCToo.__setattr__: " + msg)

        # Otherwise, use the normal __setattr__ method
        else:
            super(GCToo, self).__setattr__(name, value)

    def check_df(self, df):
        """Verify that df is a pandas DataFrame with unique index and columns.

        Args:
            df: object to validate.

        Returns:
            True if df passes all checks.

        Raises:
            Exception: if df is not a DataFrame, or if its index or column
                values contain duplicates.
        """
        if not isinstance(df, pd.DataFrame):
            msg = "expected Pandas DataFrame, got something else: {} of type: {}".format(df, type(df))
            self.logger.error(msg)
            raise Exception("GCToo GCToo.check_df " + msg)

        if not df.index.is_unique:
            repeats = df.index[df.index.duplicated()].values
            msg = "Index values must be unique but aren't. The following entries appear more than once: {}".format(repeats)
            self.logger.error(msg)
            raise Exception("GCToo GCToo.check_df " + msg)

        if not df.columns.is_unique:
            repeats = df.columns[df.columns.duplicated()].values
            msg = "Columns values must be unique but aren't. The following entries appear more than once: {}".format(repeats)
            # Log before raising, like the other failure branches do.
            self.logger.error(msg)
            raise Exception("GCToo GCToo.check_df " + msg)

        return True

    def id_match_check(self, data_df, meta_df, dim):
        """Verify that id values match between data and metadata.

        - row case: index of data_df & index of row metadata
        - col case: columns of data_df & index of column metadata

        Only the set of ids and their count are compared, not their order.

        Args:
            data_df: the data dataframe.
            meta_df: the row or column metadata dataframe.
            dim: "row" or "col"; any other value performs no check and
                returns None.

        Returns:
            True if the ids match.

        Raises:
            Exception: if the ids are inconsistent.
        """
        if dim == "row":
            if len(data_df.index) == len(meta_df.index) and set(data_df.index) == set(meta_df.index):
                return True
            else:
                msg = ("The rids are inconsistent between data_df and row_metadata_df.\n" +
                       "data_df.index.values:\n{}\nrow_metadata_df.index.values:\n{}").format(
                           data_df.index.values, meta_df.index.values)
                self.logger.error(msg)
                raise Exception("GCToo GCToo.id_match_check " + msg)
        elif dim == "col":
            if len(data_df.columns) == len(meta_df.index) and set(data_df.columns) == set(meta_df.index):
                return True
            else:
                msg = ("The cids are inconsistent between data_df and col_metadata_df.\n" +
                       "data_df.columns.values:\n{}\ncol_metadata_df.index.values:\n{}").format(
                           data_df.columns.values, meta_df.index.values)
                self.logger.error(msg)
                raise Exception("GCToo GCToo.id_match_check " + msg)

    def __str__(self):
        """Return a string summary: version, src, and component df shapes."""
        version = "{}\n".format(self.version)
        source = "src: {}\n".format(self.src)
        data = "data_df: [{} rows x {} columns]\n".format(
            self.data_df.shape[0], self.data_df.shape[1])
        row_meta = "row_metadata_df: [{} rows x {} columns]\n".format(
            self.row_metadata_df.shape[0], self.row_metadata_df.shape[1])
        col_meta = "col_metadata_df: [{} rows x {} columns]".format(
            self.col_metadata_df.shape[0], self.col_metadata_df.shape[1])

        full_string = (version + source + data + row_meta + col_meta)
        return full_string

    def assemble_multi_index_df(self):
        """Assembles three component dataframes into a multiindex dataframe.
        Sets the result to self.multi_index_df.

        Because __setattr__ refuses to reassign multi_index_df once the
        object is initialized, calling this method after __init__ has
        finished raises an Exception.

        IMPORTANT: Cross-section ("xs") is the best command for selecting
        data. Be sure to use the flag "drop_level=False" with this command,
        or else the dataframe that is returned will not have the same
        metadata as the input.

        N.B. "level" means metadata header.
        N.B. "axis=1" indicates column annotations.

        Examples:
            1) Select the probe with pr_lua_id="LUA-3404":
            lua3404_df = multi_index_df.xs("LUA-3404", level="pr_lua_id", drop_level=False)

            2) Select all DMSO samples:
            DMSO_df = multi_index_df.xs("DMSO", level="pert_iname", axis=1, drop_level=False)
        """
        # NOTE(review): in the two "empty" branches below the frame is
        # rebuilt from the metadata index alone, which appears to give it a
        # fresh positional index; the "rid"/"cid" level may then hold
        # positions rather than ids. Confirm the intended result.

        # prepare row index
        self.logger.debug("Row metadata shape: {}".format(self.row_metadata_df.shape))
        self.logger.debug("Is empty? {}".format(self.row_metadata_df.empty))
        row_copy = pd.DataFrame(self.row_metadata_df.index) if self.row_metadata_df.empty else self.row_metadata_df.copy()
        row_copy["rid"] = row_copy.index
        row_index = pd.MultiIndex.from_arrays(row_copy.T.values, names=row_copy.columns)

        # prepare column index
        self.logger.debug("Col metadata shape: {}".format(self.col_metadata_df.shape))
        col_copy = pd.DataFrame(self.col_metadata_df.index) if self.col_metadata_df.empty else self.col_metadata_df.copy()
        col_copy["cid"] = col_copy.index
        transposed_col_metadata = col_copy.T
        col_index = pd.MultiIndex.from_arrays(transposed_col_metadata.values, names=transposed_col_metadata.index)

        # Create multi index dataframe using the values of data_df and the
        # indexes created above
        self.logger.debug("Data df shape: {}".format(self.data_df.shape))
        self.multi_index_df = pd.DataFrame(data=self.data_df.values, index=row_index, columns=col_index)
def multi_index_df_to_component_dfs(multi_index_df, rid="rid", cid="cid"):
    """Convert a multi-index df into 3 component dfs.

    Args:
        multi_index_df: pandas DataFrame whose index and/or columns may be
            MultiIndexes carrying metadata levels alongside an id level.
        rid: name of the index level holding the row ids.
        cid: name of the columns level holding the column ids.

    Returns:
        Tuple (data_df, row_metadata_df, col_metadata_df). The id axes of
        the returned dataframes are named "rid" / "cid", and the metadata
        column axes are named "rhd" / "chd".
    """
    # Id level of the multiindex will become the index
    rids = list(multi_index_df.index.get_level_values(rid))
    cids = list(multi_index_df.columns.get_level_values(cid))

    # It's possible that the index and/or columns of multi_index_df are not
    # actually multi-index; need to check for this.
    # Use the public pd.MultiIndex rather than the private
    # pd.core.index path, which newer pandas releases no longer provide.
    if isinstance(multi_index_df.index, pd.MultiIndex):

        # If so, drop rid because it won't go into the body of the metadata
        mi_df_index = multi_index_df.index.droplevel(rid)

        # Names of the multiindex levels become the headers
        rhds = list(mi_df_index.names)

        # Assemble metadata values: one row per rid, one column per header
        row_metadata = np.array(
            [mi_df_index.get_level_values(level).values for level in rhds]).T

    # If the index is not multi-index, then rhds and row metadata should be
    # empty
    else:
        rhds = []
        row_metadata = []

    # Check if columns of multi_index_df are in fact multi-index
    if isinstance(multi_index_df.columns, pd.MultiIndex):

        # If so, drop cid because it won't go into the body of the metadata
        mi_df_columns = multi_index_df.columns.droplevel(cid)

        # Names of the multiindex levels become the headers
        chds = list(mi_df_columns.names)

        # Assemble metadata values: one row per cid, one column per header
        col_metadata = np.array(
            [mi_df_columns.get_level_values(level).values for level in chds]).T

    # If the columns are not multi-index, then chds and column metadata
    # should be empty
    else:
        chds = []
        col_metadata = []

    # Create component dfs
    row_metadata_df = pd.DataFrame.from_records(
        row_metadata, index=pd.Index(rids, name="rid"), columns=pd.Index(rhds, name="rhd"))
    col_metadata_df = pd.DataFrame.from_records(
        col_metadata, index=pd.Index(cids, name="cid"), columns=pd.Index(chds, name="chd"))
    data_df = pd.DataFrame(
        multi_index_df.values, index=pd.Index(rids, name="rid"), columns=pd.Index(cids, name="cid"))

    return data_df, row_metadata_df, col_metadata_df