Source code for pandas_plink._read_rel

from ._filetype import file_type


[docs]def read_rel(filepath, id_filepath=None):
    """
    Read PLINK realized relationship matrix files [b]_.

    It supports plain text, binary, and compressed files.
    The usual file extensions for those types are `.rel`, `.rel.bin`, and `.rel.zst`,
    respectively.

    Example
    -------
    .. doctest::

        >>> from os.path import join
        >>> from pandas_plink import read_rel
        >>> from pandas_plink import get_data_folder
        >>> filepath = join(get_data_folder(), "rel-bin", "plink2.rel.bin")
        >>> id_filepath = join(get_data_folder(), "rel-bin", "plink2.rel.id")
        >>> K = read_rel(filepath, id_filepath)
        >>> print(K)
        <xarray.DataArray (sample_0: 10, sample_1: 10)>
        array([[ 0.89,  0.23, -0.19, -0.01, -0.14,  0.29,  0.27, -0.23, -0.10,
                -0.21],
               [ 0.23,  1.08, -0.45,  0.19, -0.19,  0.17,  0.41, -0.01, -0.13,
                -0.13],
               [-0.19, -0.45,  1.18, -0.04, -0.15, -0.20, -0.31, -0.04,  0.30,
                -0.01],
               [-0.01,  0.19, -0.04,  0.90, -0.07,  0.01,  0.06, -0.19, -0.09,
                 0.17],
               [-0.14, -0.19, -0.15, -0.07,  1.18,  0.09, -0.03,  0.10,  0.22,
                 0.17],
               [ 0.29,  0.17, -0.20,  0.01,  0.09,  0.96,  0.07, -0.04, -0.09,
                -0.23],
               [ 0.27,  0.41, -0.31,  0.06, -0.03,  0.07,  0.71, -0.10, -0.09,
                -0.06],
               [-0.23, -0.01, -0.04, -0.19,  0.10, -0.04, -0.10,  1.42, -0.30,
                -0.07],
               [-0.10, -0.13,  0.30, -0.09,  0.22, -0.09, -0.09, -0.30,  0.91,
                -0.02],
               [-0.21, -0.13, -0.01,  0.17,  0.17, -0.23, -0.06, -0.07, -0.02,
                 0.91]])
        Coordinates:
          * sample_0  (sample_0) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753'
          * sample_1  (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753'
            fid       (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753'
            iid       (sample_1) object 'HG00419' 'HG00650' ... 'NA20508' 'NA20753'


    Parameters
    ----------
    filepath : str
        Path to the matrix file.
    id_filepath : str, optional
        Path to the file containing family and individual IDs. It defaults to
        :const:`None`, in which case it will try to be inferred.

    Returns
    -------
    rel : :class:`xarray.DataArray`
        Realized relationship matrix.

    References
    ----------
    .. [b] Read PLINK realized relationship matrix files.
           https://www.cog-genomics.org/plink/2.0/distance
    """

    ft = file_type(filepath)

    if ft == "zstd":
        return _read_rel_zs(filepath, id_filepath)
    elif ft == "bin":
        return _read_rel_bin(filepath, id_filepath)

    return _read_rel(filepath, id_filepath)


def _read_rel(filepath, id_filepath):
    df = _read_id_file(id_filepath, filepath)
    K = _1d_to_2d(_read_rel_file(filepath), df.shape[0])

    return _data_array(K, df)


def _read_rel_bin(filepath, id_filepath):
    from numpy import float64, fromfile

    df = _read_id_file(id_filepath, filepath)
    K = fromfile(filepath, dtype=float64)
    n = df.shape[0]
    K = K.reshape((n, n))

    return _data_array(K, df)


def _read_rel_file(filepath):
    from numpy import float64

    rows = []
    with open(filepath, "r") as f:
        for row in f:
            rows += [float64(v) for v in row.strip().split("\t")]
    return rows


def _read_rel_zs_rows(filepath, chunk_size=8 * 1000 * 1000):
    from zstandard import ZstdDecompressor

    with open(filepath, "rb") as fh:
        ctx = ZstdDecompressor()
        with ctx.stream_reader(fh) as reader:
            over = False
            chunks = []
            rows = []
            while not over:
                have_row = False
                while not have_row:
                    chunk = reader.read(chunk_size)
                    if not chunk:
                        over = True
                        break
                    if b"\n" in chunk:
                        have_row = True
                    chunks.append(chunk)
                (new_rows, semi_row) = _consume_rows(chunks)
                rows += new_rows
                chunks = [semi_row]
    return rows


def _consume_rows(chunks):
    from numpy import float64

    chunk = b"".join(chunks)
    rows = chunk.split(b"\n")
    semi_row = rows[-1]
    rows = [[float64(v) for v in r.split(b"\t")] for r in rows[:-1]]
    return (rows, semi_row)


def _read_rel_zs(filepath, id_filepath):
    df = _read_id_file(id_filepath, filepath)
    rows = _read_rel_zs_rows(filepath)
    K = _1d_to_2d([v for r in rows for v in r], df.shape[0])
    return _data_array(K, df)


def _data_array(K, df):
    from xarray import DataArray

    coords = (df.iloc[:, 1], df.iloc[:, 1])
    K = DataArray(K, dims=["sample_0", "sample_1"], coords=coords)
    K = K.assign_coords(**{"fid": ("sample_0", df.iloc[:, 0])})
    K = K.assign_coords(**{"fid": ("sample_1", df.iloc[:, 0])})

    K = K.assign_coords(**{"iid": ("sample_0", df.iloc[:, 1])})
    K = K.assign_coords(**{"iid": ("sample_1", df.iloc[:, 1])})

    return K


def _read_id_file(id_filepath, filepath):
    from pandas import read_csv

    if filepath.endswith(".gz"):
        basename = filepath[:-3]
    elif filepath.endswith(".bin"):
        basename = filepath[:-4]
    elif filepath.endswith(".zst"):
        basename = filepath[:-4]
    else:
        basename = filepath

    if id_filepath is None:
        id_filepath = basename + ".id"

    return read_csv(id_filepath, sep="\t", header=None, comment="#")


def _1d_to_2d(values, n):
    from numpy import tril, tril_indices_from, zeros

    K = zeros((n, n))
    K[tril_indices_from(K)] = values
    K = K + tril(K, -1).T
    return K