Source code for pandas_plink.read

from __future__ import division, unicode_literals

import sys
from collections import OrderedDict as odict
from glob import glob
from os.path import basename, dirname, join
from pandas import concat
from pandas import read_csv

from tqdm import tqdm

from .bed_read import read_bed

PY3 = sys.version_info >= (3, )

if PY3:

    def _ord(x):
        return x
else:
    _ord = ord





def _read_file(fn, desc, read_func, pbar):
    data = []
    for f in fn:
        data.append(read_func(f))
        pbar.update(1)
    return data


def _read_csv(fn, header):
    return read_csv(
        fn,
        delim_whitespace=True,
        header=None,
        names=header.keys(),
        dtype=header,
        compression=None,
        engine='c')


def _read_bim(fn):
    header = odict([('chrom', bytes), ('snp', bytes), ('cm', float),
                    ('pos', int), ('a0', bytes), ('a1', bytes)])
    df = _read_csv(fn, header)

    df['chrom'] = df['chrom'].astype('category')
    df['a0'] = df['a0'].astype('category')
    df['a1'] = df['a1'].astype('category')
    df['i'] = range(df.shape[0])
    return df


def _read_fam(fn):
    header = odict([('fid', str), ('iid', str), ('father', str),
                    ('mother', str), ('gender', bytes), ('trait', str)])

    df = _read_csv(fn, header)

    df['gender'] = df['gender'].astype('category')
    df['i'] = range(df.shape[0])
    return df


def _read_bed(fn, nsamples, nmarkers):
    fn = _ascii_airlock(fn)

    _check_bed_header(fn)
    major = _major_order(fn)

    ncols = nmarkers if major == 'individual' else nsamples
    nrows = nmarkers if major == 'snp' else nsamples

    return read_bed(fn, nrows, ncols)


def _check_bed_header(fn):
    with open(fn, "rb") as f:
        arr = f.read(2)
        if len(arr) < 2:
            raise ValueError("Couldn't read BED header: %s." % fn)
        ok = _ord(arr[0]) == 108 and _ord(arr[1]) == 27
        if not ok:
            raise ValueError("Invalid BED file: %s." % fn)


def _major_order(fn):
    with open(fn, "rb") as f:
        f.seek(2)
        arr = f.read(1)
        if len(arr) < 1:
            raise ValueError("Couldn't read column order: %s." % fn)
        if _ord(arr[0]) == 1:
            return 'snp'
        elif _ord(arr[0]) == 0:
            return 'individual'
        raise ValueError("Couldn't understand matrix layout.")


def _ascii_airlock(v):
    if not isinstance(v, bytes):
        v = v.encode()
    return v


def _clean_prefixes(prefixes):
    paths = []
    for p in prefixes:
        dirn = dirname(p)
        basen = basename(p)
        base = '.'.join(basen.split('.')[:-1])
        if len(base) == 0:
            path = p
        else:
            ext = basen.split('.')[-1]
            if ext not in ['bed', 'fam', 'bim']:
                base += '.' + ext
            path = join(dirn, base)
        paths.append(path)
    return list(set(paths))