Source code for pandas_plink._read_rel

from ._filetype import file_type


[docs]def read_rel(filepath, id_filepath=None): """ Read PLINK realized relationship matrix files [b]_. It supports plain text, binary, and compressed files. The usual file extensions for those types are `.rel`, `.rel.bin`, and `.rel.zst`, respectively. Example ------- .. doctest:: >>> from os.path import join >>> from pandas_plink import read_rel >>> from pandas_plink import get_data_folder >>> filepath = join(get_data_folder(), "rel-bin", "plink2.rel.bin") >>> id_filepath = join(get_data_folder(), "rel-bin", "plink2.rel.id") >>> K = read_rel(filepath, id_filepath) >>> print(K) <xarray.DataArray (sample_0: 10, sample_1: 10)> array([[ 0.89, 0.23, -0.19, -0.01, -0.14, 0.29, 0.27, -0.23, -0.10, -0.21], [ 0.23, 1.08, -0.45, 0.19, -0.19, 0.17, 0.41, -0.01, -0.13, -0.13], [-0.19, -0.45, 1.18, -0.04, -0.15, -0.20, -0.31, -0.04, 0.30, -0.01], [-0.01, 0.19, -0.04, 0.90, -0.07, 0.01, 0.06, -0.19, -0.09, 0.17], [-0.14, -0.19, -0.15, -0.07, 1.18, 0.09, -0.03, 0.10, 0.22, 0.17], [ 0.29, 0.17, -0.20, 0.01, 0.09, 0.96, 0.07, -0.04, -0.09, -0.23], [ 0.27, 0.41, -0.31, 0.06, -0.03, 0.07, 0.71, -0.10, -0.09, -0.06], [-0.23, -0.01, -0.04, -0.19, 0.10, -0.04, -0.10, 1.42, -0.30, -0.07], [-0.10, -0.13, 0.30, -0.09, 0.22, -0.09, -0.09, -0.30, 0.91, -0.02], [-0.21, -0.13, -0.01, 0.17, 0.17, -0.23, -0.06, -0.07, -0.02, 0.91]]) Coordinates: * sample_0 (sample_0) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753' * sample_1 (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753' fid (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753' iid (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753' Parameters ---------- filepath : str Path to the matrix file. id_filepath : str, optional Path to the file containing family and individual IDs. It defaults to :const:`None`, in which case it will try to be inferred. Returns ------- rel : :class:`xarray.DataArray` Realized relationship matrix. References ---------- .. [b] Read PLINK realized relationship matrix files. https://www.cog-genomics.org/plink/2.0/distance """ ft = file_type(filepath) if ft == "zstd": return _read_rel_zs(filepath, id_filepath) elif ft == "bin": return _read_rel_bin(filepath, id_filepath) return _read_rel(filepath, id_filepath)
def _read_rel(filepath, id_filepath): df = _read_id_file(id_filepath, filepath) K = _1d_to_2d(_read_rel_file(filepath), df.shape[0]) return _data_array(K, df) def _read_rel_bin(filepath, id_filepath): from numpy import float64, fromfile df = _read_id_file(id_filepath, filepath) K = fromfile(filepath, dtype=float64) n = df.shape[0] K = K.reshape((n, n)) return _data_array(K, df) def _read_rel_file(filepath): from numpy import float64 rows = [] with open(filepath, "r") as f: for row in f: rows += [float64(v) for v in row.strip().split("\t")] return rows def _read_rel_zs_rows(filepath, chunk_size=8 * 1000 * 1000): from zstandard import ZstdDecompressor with open(filepath, "rb") as fh: ctx = ZstdDecompressor() with ctx.stream_reader(fh) as reader: over = False chunks = [] rows = [] while not over: have_row = False while not have_row: chunk = reader.read(chunk_size) if not chunk: over = True break if b"\n" in chunk: have_row = True chunks.append(chunk) (new_rows, semi_row) = _consume_rows(chunks) rows += new_rows chunks = [semi_row] return rows def _consume_rows(chunks): from numpy import float64 chunk = b"".join(chunks) rows = chunk.split(b"\n") semi_row = rows[-1] rows = [[float64(v) for v in r.split(b"\t")] for r in rows[:-1]] return (rows, semi_row) def _read_rel_zs(filepath, id_filepath): df = _read_id_file(id_filepath, filepath) rows = _read_rel_zs_rows(filepath) K = _1d_to_2d([v for r in rows for v in r], df.shape[0]) return _data_array(K, df) def _data_array(K, df): from xarray import DataArray coords = (df.iloc[:, 1], df.iloc[:, 1]) K = DataArray(K, dims=["sample_0", "sample_1"], coords=coords) K = K.assign_coords(**{"fid": ("sample_0", df.iloc[:, 0])}) K = K.assign_coords(**{"fid": ("sample_1", df.iloc[:, 0])}) K = K.assign_coords(**{"iid": ("sample_0", df.iloc[:, 1])}) K = K.assign_coords(**{"iid": ("sample_1", df.iloc[:, 1])}) return K def _read_id_file(id_filepath, filepath): from pandas import read_csv if filepath.endswith(".gz"): basename = filepath[:-3] elif filepath.endswith(".bin"): basename = filepath[:-4] elif filepath.endswith(".zst"): basename = filepath[:-4] else: basename = filepath if id_filepath is None: id_filepath = basename + ".id" return read_csv(id_filepath, sep="\t", header=None, comment="#") def _1d_to_2d(values, n): from numpy import tril, tril_indices_from, zeros K = zeros((n, n)) K[tril_indices_from(K)] = values K = K + tril(K, -1).T return K