import warnings
from collections import OrderedDict as odict
from glob import glob
from os.path import basename, dirname, join
from typing import Optional
from pandas import DataFrame, read_csv
from xarray import DataArray
from ._allele import Allele
from ._bed_read import read_bed
from ._chunk import Chunk
from ._util import last_replace
__all__ = ["read_plink", "read_plink1_bin"]
[docs]def read_plink(file_prefix, verbose=True):
"""
Read PLINK files into data frames.
.. note::
The function :func:`pandas_plink.read_plink1_bin` provides an alternative
interface to read the same files.
The genotype values can be either :const:`0`, :const:`1`, :const:`2`, or
:data:`math.nan`:
- :const:`0` Homozygous having the first allele (given by coordinate **a0**)
- :const:`1` Heterozygous
- :const:`2` Homozygous having the second allele (given by coordinate **a1**)
- :data:`math.nan` Missing genotype
Examples
--------
The following example reads two BED files and two BIM files correspondig to
chromosomes 11 and 12, and read a single FAM file whose filename is inferred from
the BED filenames.
.. doctest::
>>> from os.path import join
>>> from pandas_plink import read_plink
>>> from pandas_plink import get_data_folder
>>> (bim, fam, bed) = read_plink(join(get_data_folder(), "chr*.bed"),
... verbose=False)
>>> print(bim.head())
chrom snp cm pos a0 a1 i
0 11 316849996 0.00 157439 C T 0
1 11 316874359 0.00 181802 G C 1
2 11 316941526 0.00 248969 G C 2
3 11 317137620 0.00 445063 C T 3
4 11 317534352 0.00 841795 C T 4
>>> print(fam.head())
fid iid father mother gender trait i
0 B001 B001 0 0 0 -9 0
1 B002 B002 0 0 0 -9 1
2 B003 B003 0 0 0 -9 2
3 B004 B004 0 0 0 -9 3
4 B005 B005 0 0 0 -9 4
>>> print(bed.compute())
[[0.00 0.00 0.00 ... 2.00 2.00 0.00]
[0.00 1.00 0.00 ... 2.00 1.00 0.00]
[2.00 2.00 2.00 ... 0.00 0.00 2.00]
...
[2.00 0.00 0.00 ... 2.00 2.00 1.00]
[2.00 0.00 0.00 ... 2.00 2.00 0.00]
[0.00 nan 0.00 ... 1.00 2.00 0.00]]
Parameters
----------
file_prefix : str
Path prefix to the set of PLINK files. It supports loading many BED files at
once using globstrings wildcard.
verbose : bool
:const:`True` for progress information; :const:`False` otherwise.
Returns
-------
alleles : :class:`pandas.DataFrame`
Alleles.
samples : :class:`pandas.DataFrame`
Samples.
genotypes : :class:`dask.array.Array`
Genotype.
"""
import pandas as pd
from dask.array import concatenate
from tqdm import tqdm
file_prefixes = sorted(glob(file_prefix))
if len(file_prefixes) == 0:
file_prefixes = [file_prefix.replace("*", "")]
file_prefixes = sorted(_clean_prefixes(file_prefixes))
fn = []
for fp in file_prefixes:
fn.append({s: "%s.%s" % (fp, s) for s in ["bed", "bim", "fam"]})
pbar = tqdm(desc="Mapping files", total=3 * len(fn), disable=not verbose)
bim = _read_file(fn, lambda fn: _read_bim(fn["bim"]), pbar)
if len(file_prefixes) > 1:
if verbose:
msg = "Multiple files read in this order: {}"
print(msg.format([basename(f) for f in file_prefixes]))
nmarkers = dict()
index_offset = 0
for i, bi in enumerate(bim):
nmarkers[fn[i]["bed"]] = bi.shape[0]
bi["i"] += index_offset
index_offset += bi.shape[0]
bim = pd.concat(bim, axis=0, ignore_index=True)
fam = _read_file([fn[0]], lambda fn: _read_fam(fn["fam"]), pbar)[0]
nsamples = fam.shape[0]
ref = Allele.a1
bed = _read_file(
fn,
lambda f: _read_bed(f["bed"], nsamples, nmarkers[f["bed"]], ref, Chunk()).T,
pbar,
)
bed = concatenate(bed, axis=0)
pbar.close()
return (bim, fam, bed)
[docs]def read_plink1_bin(
bed: str,
bim: Optional[str] = None,
fam: Optional[str] = None,
verbose: bool = True,
ref: str = "a1",
chunk: Chunk = Chunk(),
) -> DataArray:
"""
Read PLINK 1 binary files [a]_ into a data array.
A PLINK 1 binary file set consists of three files:
- BED: containing the genotype.
- BIM: containing variant information.
- FAM: containing sample information.
The user might provide a single file path to a BED file, from which this function
will try to infer the file path of the other two files.
This function also allows the user to provide file path to multiple BED and
BIM files, as it is common to have a data set split into multiple files, one per
chromosome.
This function returns a samples-by-variants matrix. This is a special kind of matrix
with rows and columns having multiple coordinates each. Those coordinates have the
metainformation contained in the BIM and FAM files.
Examples
--------
The following example reads two BED files and two BIM files correspondig to
chromosomes 11 and 12, and read a single FAM file whose filename is inferred from
the BED filenames.
.. doctest::
>>> from os.path import join
>>> from pandas_plink import read_plink1_bin
>>> from pandas_plink import get_data_folder
>>> G = read_plink1_bin(join(get_data_folder(), "chr*.bed"), verbose=False)
>>> print(G)
<xarray.DataArray 'genotype' (sample: 14, variant: 1252)>
dask.array<concatenate, shape=(14, 1252), dtype=float32, chunksize=(14, 779), chunktype=numpy.ndarray>
Coordinates: (12/14)
* sample (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
* variant (variant) <U11 'variant0' 'variant1' ... 'variant1251'
fid (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
iid (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
father (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
mother (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
... ...
chrom (variant) object '11' '11' '11' '11' '11' ... '12' '12' '12' '12'
snp (variant) object '316849996' '316874359' ... '373081507'
cm (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
pos (variant) int32 157439 181802 248969 ... 27163741 27205125 27367844
a0 (variant) object 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'G' 'A' 'T' 'G'
a1 (variant) object 'T' 'C' 'C' 'T' 'T' 'A' ... 'G' 'A' 'T' 'C' 'A'
>>> print(G.shape)
(14, 1252)
Suppose we want the genotypes of the chromosome 11 only:
.. doctest::
>>> G = G.where(G.chrom == "11", drop=True)
>>> print(G)
<xarray.DataArray 'genotype' (sample: 14, variant: 779)>
dask.array<where, shape=(14, 779), dtype=float32, chunksize=(14, 779), chunktype=numpy.ndarray>
Coordinates: (12/14)
* sample (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
* variant (variant) <U11 'variant0' 'variant1' ... 'variant777' 'variant778'
fid (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
iid (sample) object 'B001' 'B002' 'B003' ... 'B012' 'B013' 'B014'
father (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
mother (sample) object '0' '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0' '0'
... ...
chrom (variant) object '11' '11' '11' '11' '11' ... '11' '11' '11' '11'
snp (variant) object '316849996' '316874359' ... '345698259'
cm (variant) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
pos (variant) int32 157439 181802 248969 ... 28937375 28961091 29005702
a0 (variant) object 'C' 'G' 'G' 'C' 'C' 'T' ... 'A' 'C' 'A' 'A' 'T'
a1 (variant) object 'T' 'C' 'C' 'T' 'T' 'A' ... 'G' 'T' 'G' 'C' 'C'
>>> print(G.shape)
(14, 779)
Lets now print the genotype value of the sample `B003` for variant `variant5`:
.. doctest::
>>> print(G.sel(sample="B003", variant="variant5").values)
0.0
The special matrix we return is of type :class:`xarray.DataArray`. More information
about it can be found at the `xarray documentation <http://xarray.pydata.org>`_.
Parameters
----------
bed
Path to a BED file. It can contain shell-style wildcards to indicate multiple
BED files.
bim
Path to a BIM file. It can contain shell-style wildcards to indicate multiple
BIM files. It defaults to :const:`None`, in which case it will try to be inferred.
fam
Path to a FAM file. It defaults to :const:`None`, in which case it will try to be
inferred.
verbose
:const:`True` for progress information; :const:`False` otherwise.
ref
Reference allele. Specify which allele the dosage matrix will count. It can
be either :const:`"a1"` (default) or :const:`"a0"`.
chunk
Data chunk specification. Useful to adjust the trade-off between computational
overhead and IO usage. See :class:`pandas_plink.Chunk`.
Returns
-------
:class:`xarray.DataArray`
Genotype with metadata.
References
----------
.. [a] PLINK 1 binary. https://www.cog-genomics.org/plink/2.0/input#bed
"""
import dask.array as da
import pandas as pd
from tqdm import tqdm
bed_files = sorted(glob(bed))
if len(bed_files) == 0:
raise ValueError("No BED file has been found.")
if bim is None:
bim_files = [last_replace(f, ".bed", ".bim") for f in bed_files]
else:
bim_files = sorted(glob(bim))
if len(bim_files) == 0:
raise ValueError("No BIM file has been found.")
if fam is None:
fam_files = [last_replace(f, ".bed", ".fam") for f in bed_files]
else:
fam_files = sorted(glob(fam))
if len(fam_files) == 0:
raise ValueError("No FAM file has been found.")
if len(bed_files) != len(bim_files):
raise ValueError("The numbers of BED and BIM files must match.")
if len(fam_files) > 1:
msg = "More than one FAM file has been specified. Only the first one will be "
msg += "considered."
if verbose:
warnings.warn(msg, UserWarning)
fam_files = fam_files[:1]
nfiles = len(bed_files) + len(bim_files) + 1
pbar = tqdm(desc="Mapping files", total=nfiles, disable=not verbose)
bims = _read_file(bim_files, lambda f: _read_bim_noi(f), pbar)
nmarkers = {bed_files[i]: b.shape[0] for i, b in enumerate(bims)}
bim_df = pd.concat(bims, axis=0, ignore_index=True)
fam_df = _read_file(fam_files, lambda f: _read_fam_noi(f), pbar)[0]
nsamples = fam_df.shape[0]
sample_ids = fam_df["iid"]
nvariants = bim_df.shape[0]
variant_ids = [f"variant{i}" for i in range(nvariants)]
if ref == "a1":
ref_al = Allele.a1
elif ref == "a0":
ref_al = Allele.a0
else:
raise ValueError("Unknown reference allele.")
G = _read_file(
bed_files, lambda f: _read_bed(f, nsamples, nmarkers[f], ref_al, chunk), pbar
)
G = da.concatenate(G, axis=1)
G = DataArray(G, dims=["sample", "variant"], coords=[sample_ids, variant_ids])
sample = {c: ("sample", fam_df[c]) for c in fam_df.columns}
variant = {c: ("variant", bim_df[c]) for c in iter(bim_df.columns)}
G = G.assign_coords(**sample)
G = G.assign_coords(**variant)
G.name = "genotype"
pbar.close()
return G
def _read_file(fn, read_func, pbar):
data = []
for f in fn:
data.append(read_func(f))
pbar.update(1)
return data
def _read_csv(fn, header) -> DataFrame:
df = read_csv(
fn,
delim_whitespace=True,
header=None,
names=list(header.keys()),
dtype=header,
compression=None,
engine="c",
iterator=False,
)
assert isinstance(df, DataFrame)
return df
def _read_bim(fn):
df = _read_bim_noi(fn)
df["i"] = range(df.shape[0])
return df
def _read_bim_noi(fn):
from ._type import bim
header = odict(
[
("chrom", bim["chrom"]),
("snp", bim["snp"]),
("cm", bim["cm"]),
("pos", bim["pos"]),
("a0", bim["a0"]),
("a1", bim["a1"]),
]
)
return _read_csv(fn, header)
def _read_fam(fn):
df = _read_fam_noi(fn)
df["i"] = range(df.shape[0])
return df
def _read_fam_noi(fn):
from ._type import fam
header = odict(
[
("fid", fam["fid"]),
("iid", fam["iid"]),
("father", fam["father"]),
("mother", fam["mother"]),
("gender", fam["gender"]),
("trait", fam["trait"]),
]
)
return _read_csv(fn, header)
def _read_bed(fn, nsamples, nvariants, ref: Allele, chunk: Chunk):
_check_bed_header(fn)
major = _major_order(fn)
# Assume major == "variant".
nrows = nvariants
ncols = nsamples
row_chunk = nrows if chunk.nvariants is None else min(nrows, chunk.nvariants)
col_chunk = ncols if chunk.nsamples is None else min(ncols, chunk.nsamples)
if major == "sample":
nrows, ncols = ncols, nrows
row_chunk, col_chunk = col_chunk, row_chunk
max_npartitions = 16_384
row_chunk = max(nrows // max_npartitions, row_chunk)
col_chunk = max(ncols // max_npartitions, col_chunk)
G = read_bed(fn, nrows, ncols, row_chunk, col_chunk, ref)
if major == "variant":
G = G.T
return G
def _check_bed_header(fn):
with open(fn, "rb") as f:
arr = f.read(2)
if len(arr) < 2:
raise ValueError("Couldn't read BED header: %s." % fn)
ok = arr[0] == 108 and arr[1] == 27
if not ok:
raise ValueError("Invalid BED file: %s." % fn)
def _major_order(fn):
"""
Major order.
Variant-major lists all samples for first variant, all samples for second
variant, and so on. Sample-major lists all variants for first sample, all
variants for second sample, and so on.
"""
with open(fn, "rb") as f:
f.seek(2)
arr = f.read(1)
if len(arr) < 1:
raise ValueError("Couldn't read column order: %s." % fn)
if arr[0] == 1:
return "variant"
elif arr[0] == 0:
return "sample"
msg = "Invalid matrix layout. Maybe it is a PLINK2 file?"
msg += " PLINK2 is not supported yet."
raise ValueError(msg)
def _clean_prefixes(prefixes):
paths = []
for p in prefixes:
dirn = dirname(p)
basen = basename(p)
base = ".".join(basen.split(".")[:-1])
if len(base) == 0:
path = p
else:
ext = basen.split(".")[-1]
if ext not in ["bed", "fam", "bim", "nosex", "log"]:
base += "." + ext
path = join(dirn, base)
paths.append(path)
return list(set(paths))