Source code for pandas_plink._read

import warnings
from collections import OrderedDict as odict
from glob import glob
from os.path import basename, dirname, join
from typing import Optional

from pandas import DataFrame, read_csv
from xarray import DataArray

from ._allele import Allele
from ._bed_read import read_bed
from ._chunk import Chunk
from ._util import last_replace

__all__ = ["read_plink", "read_plink1_bin"]


def read_plink1_bin(
    bed: str,
    bim: Optional[str] = None,
    fam: Optional[str] = None,
    verbose: bool = True,
    ref: str = "a1",
    chunk: Chunk = Chunk(),
) -> DataArray:
    """
    Read PLINK 1 binary files [a]_ into a data array.

    A PLINK 1 binary file set consists of three files:

    - BED: containing the genotype.
    - BIM: containing variant information.
    - FAM: containing sample information.

    The user might provide a single file path to a BED file, from which this function
    will try to infer the file paths of the other two files. This function also allows
    the user to provide file paths to multiple BED and BIM files, as it is common to
    have a data set split into multiple files, one per chromosome.

    This function returns a samples-by-variants matrix. This is a special kind of
    matrix with rows and columns having multiple coordinates each. Those coordinates
    hold the metainformation contained in the BIM and FAM files.

    Examples
    --------
    The following example reads two BED files and two BIM files corresponding to
    chromosomes 11 and 12, and reads a single FAM file whose filename is inferred from
    the BED filenames.

    .. doctest::

        >>> from os.path import join
        >>> from pandas_plink import read_plink1_bin
        >>> from pandas_plink import get_data_folder
        >>> G = read_plink1_bin(join(get_data_folder(), "chr*.bed"), verbose=False)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 1252)>
        dask.array<concatenate, shape=(14, 1252), dtype=float32, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates: (12/14)
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) <U11 'variant0' 'variant1' ... 'variant1251'
            fid      (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
            iid      (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
            father   (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            ...       ...
            chrom    (variant) object '11' '11' '11' '11' '11' ... '12' '12' '12' '12'
            snp      (variant) object '316849996' '316874359' ... '373081507'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int32 157439 181802 248969 ... 27163741 27205125 27367844
            a0       (variant) object 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'G' 'A' 'T' 'G'
            a1       (variant) object 'T' 'C' 'C' 'T' 'T' 'A' ... 'G' 'A' 'T' 'C' 'A'
        >>> print(G.shape)
        (14, 1252)

    Suppose we want the genotypes of the chromosome 11 only:

    .. doctest::

        >>> G = G.where(G.chrom == "11", drop=True)
        >>> print(G)
        <xarray.DataArray 'genotype' (sample: 14, variant: 779)>
        dask.array<where, shape=(14, 779), dtype=float32, chunksize=(14, 779), chunktype=numpy.ndarray>
        Coordinates: (12/14)
          * sample   (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
          * variant  (variant) <U11 'variant0' 'variant1' ... 'variant777' 'variant778'
            fid      (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
            iid      (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
            father   (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            mother   (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
            ...       ...
            chrom    (variant) object '11' '11' '11' '11' '11' ... '11' '11' '11' '11'
            snp      (variant) object '316849996' '316874359' ... '345698259'
            cm       (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
            pos      (variant) int32 157439 181802 248969 ... 28937375 28961091 29005702
            a0       (variant) object 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'C' 'A' 'A' 'T'
            a1       (variant) object 'T' 'C' 'C' 'T' 'T' 'A' ... 'G' 'T' 'G' 'C' 'C'
        >>> print(G.shape)
        (14, 779)

    Let's now print the genotype value of the sample `B003` for variant `variant5`:

    .. doctest::

        >>> print(G.sel(sample="B003", variant="variant5").values)
        0.0

    The special matrix we return is of type :class:`xarray.DataArray`. More information
    about it can be found at the `xarray documentation <http://xarray.pydata.org>`_.

    Parameters
    ----------
    bed
        Path to a BED file. It can contain shell-style wildcards to indicate multiple
        BED files.
    bim
        Path to a BIM file. It can contain shell-style wildcards to indicate multiple
        BIM files. It defaults to :const:`None`, in which case it will try to be
        inferred.
    fam
        Path to a FAM file. It defaults to :const:`None`, in which case it will try to
        be inferred.
    verbose
        :const:`True` for progress information; :const:`False` otherwise.
    ref
        Reference allele. Specify which allele the dosage matrix will count. It can be
        either :const:`"a1"` (default) or :const:`"a0"`.
    chunk
        Data chunk specification. Useful to adjust the trade-off between computational
        overhead and IO usage. See :class:`pandas_plink.Chunk`.

    Returns
    -------
    :class:`xarray.DataArray`
        Genotype with metadata.

    References
    ----------
    .. [a] PLINK 1 binary. https://www.cog-genomics.org/plink/2.0/input#bed
    """
    import dask.array as da
    import pandas as pd
    from tqdm import tqdm

    bed_files = sorted(glob(bed))
    if len(bed_files) == 0:
        raise ValueError("No BED file has been found.")

    if bim is None:
        bim_files = [last_replace(f, ".bed", ".bim") for f in bed_files]
    else:
        bim_files = sorted(glob(bim))
    if len(bim_files) == 0:
        raise ValueError("No BIM file has been found.")

    if fam is None:
        fam_files = [last_replace(f, ".bed", ".fam") for f in bed_files]
    else:
        fam_files = sorted(glob(fam))
    if len(fam_files) == 0:
        raise ValueError("No FAM file has been found.")

    if len(bed_files) != len(bim_files):
        raise ValueError("The numbers of BED and BIM files must match.")

    if len(fam_files) > 1:
        msg = "More than one FAM file has been specified. Only the first one will be "
        msg += "considered."
        if verbose:
            warnings.warn(msg, UserWarning)
        fam_files = fam_files[:1]

    nfiles = len(bed_files) + len(bim_files) + 1
    pbar = tqdm(desc="Mapping files", total=nfiles, disable=not verbose)

    bims = _read_file(bim_files, lambda f: _read_bim_noi(f), pbar)
    nmarkers = {bed_files[i]: b.shape[0] for i, b in enumerate(bims)}
    bim_df = pd.concat(bims, axis=0, ignore_index=True)

    fam_df = _read_file(fam_files, lambda f: _read_fam_noi(f), pbar)[0]

    nsamples = fam_df.shape[0]
    sample_ids = fam_df["iid"]
    nvariants = bim_df.shape[0]
    variant_ids = [f"variant{i}" for i in range(nvariants)]

    if ref == "a1":
        ref_al = Allele.a1
    elif ref == "a0":
        ref_al = Allele.a0
    else:
        raise ValueError("Unknown reference allele.")

    G = _read_file(
        bed_files, lambda f: _read_bed(f, nsamples, nmarkers[f], ref_al, chunk), pbar
    )
    G = da.concatenate(G, axis=1)

    G = DataArray(G, dims=["sample", "variant"], coords=[sample_ids, variant_ids])
    sample = {c: ("sample", fam_df[c]) for c in fam_df.columns}
    variant = {c: ("variant", bim_df[c]) for c in iter(bim_df.columns)}
    G = G.assign_coords(**sample)
    G = G.assign_coords(**variant)
    G.name = "genotype"

    pbar.close()

    return G
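

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library): how
# read_plink1_bin might be called on a data set split by chromosome, counting
# the a0 allele and tuning the chunk size. The file paths are hypothetical,
# and Chunk(nsamples=..., nvariants=...) is assumed from how chunk.nsamples
# and chunk.nvariants are used in _read_bed below.
# ---------------------------------------------------------------------------
def _example_read_split_dataset():  # pragma: no cover
    G = read_plink1_bin(
        "data/chr*.bed",  # shell-style wildcard over per-chromosome BED files
        bim="data/chr*.bim",  # matching BIM files (same count as BED files)
        fam="data/chr11.fam",  # a single FAM file describes all samples
        ref="a0",  # count the a0 allele instead of the default a1
        chunk=Chunk(nsamples=1024, nvariants=1024),
        verbose=False,
    )
    # BIM/FAM metadata is attached as coordinates; selection stays lazy.
    chr11 = G.where(G.chrom == "11", drop=True)
    return chr11.values  # materializes the dask-backed dosage matrix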


def _read_file(fn, read_func, pbar):
    data = []
    for f in fn:
        data.append(read_func(f))
        pbar.update(1)
    return data


def _read_csv(fn, header) -> DataFrame:
    df = read_csv(
        fn,
        delim_whitespace=True,
        header=None,
        names=list(header.keys()),
        dtype=header,
        compression=None,
        engine="c",
        iterator=False,
    )
    assert isinstance(df, DataFrame)
    return df


def _read_bim(fn):
    df = _read_bim_noi(fn)
    df["i"] = range(df.shape[0])
    return df


def _read_bim_noi(fn):
    from ._type import bim

    header = odict(
        [
            ("chrom", bim["chrom"]),
            ("snp", bim["snp"]),
            ("cm", bim["cm"]),
            ("pos", bim["pos"]),
            ("a0", bim["a0"]),
            ("a1", bim["a1"]),
        ]
    )
    return _read_csv(fn, header)


def _read_fam(fn):
    df = _read_fam_noi(fn)
    df["i"] = range(df.shape[0])
    return df


def _read_fam_noi(fn):
    from ._type import fam

    header = odict(
        [
            ("fid", fam["fid"]),
            ("iid", fam["iid"]),
            ("father", fam["father"]),
            ("mother", fam["mother"]),
            ("gender", fam["gender"]),
            ("trait", fam["trait"]),
        ]
    )
    return _read_csv(fn, header)


def _read_bed(fn, nsamples, nvariants, ref: Allele, chunk: Chunk):
    _check_bed_header(fn)
    major = _major_order(fn)

    # Assume major == "variant".
    nrows = nvariants
    ncols = nsamples
    row_chunk = nrows if chunk.nvariants is None else min(nrows, chunk.nvariants)
    col_chunk = ncols if chunk.nsamples is None else min(ncols, chunk.nsamples)
    if major == "sample":
        nrows, ncols = ncols, nrows
        row_chunk, col_chunk = col_chunk, row_chunk

    max_npartitions = 16_384
    row_chunk = max(nrows // max_npartitions, row_chunk)
    col_chunk = max(ncols // max_npartitions, col_chunk)
    G = read_bed(fn, nrows, ncols, row_chunk, col_chunk, ref)
    if major == "variant":
        G = G.T

    return G


def _check_bed_header(fn):
    with open(fn, "rb") as f:
        arr = f.read(2)
        if len(arr) < 2:
            raise ValueError("Couldn't read BED header: %s." % fn)
        ok = arr[0] == 108 and arr[1] == 27
        if not ok:
            raise ValueError("Invalid BED file: %s." % fn)


def _major_order(fn):
    """
    Major order.

    Variant-major lists all samples for first variant, all samples for second variant,
    and so on. Sample-major lists all variants for first sample, all variants for
    second sample, and so on.
    """
    with open(fn, "rb") as f:
        f.seek(2)
        arr = f.read(1)
        if len(arr) < 1:
            raise ValueError("Couldn't read column order: %s." % fn)
        if arr[0] == 1:
            return "variant"
        elif arr[0] == 0:
            return "sample"
        msg = "Invalid matrix layout. Maybe it is a PLINK2 file?"
        msg += " PLINK2 is not supported yet."
        raise ValueError(msg)


def _clean_prefixes(prefixes):
    paths = []
    for p in prefixes:
        dirn = dirname(p)
        basen = basename(p)
        base = ".".join(basen.split(".")[:-1])
        if len(base) == 0:
            path = p
        else:
            ext = basen.split(".")[-1]
            if ext not in ["bed", "fam", "bim", "nosex", "log"]:
                base += "." + ext
            path = join(dirn, base)
        paths.append(path)
    return list(set(paths))
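

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library): the three-byte header parsed
# by _check_bed_header and _major_order above. The first two bytes are the
# PLINK 1 BED magic numbers 0x6C 0x1B (108, 27) and the third byte encodes the
# matrix layout: 1 for variant-major, 0 for sample-major.
# ---------------------------------------------------------------------------
def _example_bed_header_roundtrip():  # pragma: no cover
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".bed", delete=False) as f:
        f.write(bytes([0x6C, 0x1B, 0x01]))  # magic bytes + variant-major flag
        fn = f.name

    _check_bed_header(fn)  # passes: the magic bytes match
    assert _major_order(fn) == "variant"
    return fn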