Simple Python clone of GNU datamash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# -*- coding: UTF8 -*- | |
# Author: Guillaume Bouvier -- guillaume.bouvier@pasteur.fr | |
# https://research.pasteur.fr/en/member/guillaume-bouvier/ | |
# 2017-09-19 10:27:58 (UTC+0200) | |
import sys | |
from optparse import OptionParser | |
import numpy | |
import pandas | |
from prettytable import PrettyTable | |
def prettyprint(keys, results, operations, size=None):
    """
    Print a borderless table holding one "key" column followed by the
    non-empty result columns of every operation.

    keys: 1-D array of group keys (indexed as keys[:, None] below)
    results: mapping operation name -> array; empty arrays are skipped,
             non-empty ones are read as 2-D (their shape[1] is used)
    operations: operation names, in the order their columns are printed
    size: when not None, every key is repeated `size` times before the
          result columns are appended
    """
    row_keys = keys if size is None else numpy.repeat(keys, size)
    table_data = row_keys[:, None]  # keys as a single column
    header = ["key"]
    for op_name in operations:
        values = results[op_name]
        if values.size == 0:
            # Operation not requested (or produced nothing): no column
            continue
        table_data = numpy.c_[table_data, values]
        # One header per result column: <operation><column index>
        header += ["%s%s" % (op_name, idx) for idx in range(values.shape[1])]
    table = PrettyTable(header)
    for line in table_data:
        table.add_row(list(line))
    print(table.get_string(border=False, header=True))
def prettyprint_array(A):
    """
    Print the array A as a table without header and without border.

    The field names handed to PrettyTable are A.dtype.names; the rows of A
    are added one by one in iteration order.
    """
    table = PrettyTable(A.dtype.names, header=False, border=False)
    for record in A:
        table.add_row(record)
    print(table)
if __name__ == '__main__':
    # Script entry point: read a table from stdin, optionally slice, smooth
    # or transpose it, then apply the requested operations per group and
    # print the result as a table.
    parser = OptionParser()
    parser.add_option("-g", "--group", dest="group", default=None, type=int,
                      metavar="X[,Y,Z]",
                      help="group via fields X,[Y,Z]")
    parser.add_option("--slice", dest="slice", default=None, type=int,
                      metavar="X",
                      help="Create slices of given size instead of using group option")
    parser.add_option("--nslice", dest="nslice", default=None, type=int,
                      metavar="N",
                      help="Create N slices instead of using group option")
    parser.add_option("--moving_average", dest="moving_average", default=None, type=int,
                      metavar="X",
                      help="Create a moving_average using the given window size. Return an array with the rolling mean and the rolling std for each column")
    parser.add_option("--mean", dest="mean", default=False, action="store_true",
                      help="Compute the mean on the grouped values")
    parser.add_option("--std", dest="std", default=False, action="store_true",
                      help="Compute the standard deviation of the grouped values")
    parser.add_option("--se", dest="se", default=False, action="store_true",
                      help="Compute the standard error of the grouped values")
    parser.add_option("--min", dest="min", default=False, action="store_true",
                      help="Get the minimum of the grouped values")
    parser.add_option("--max", dest="max", default=False, action="store_true",
                      help="Get the maximum of the grouped values")
    parser.add_option("--median", dest="median", default=False, action="store_true",
                      help="Get the median of the grouped values")
    parser.add_option("--argmin", dest="argmin", default=False, action="store_true",
                      help="Get the argmin of the grouped values")
    parser.add_option("--argmax", dest="argmax", default=False, action="store_true",
                      help="Get the argmax of the grouped values")
    parser.add_option("--sum", dest="sum", default=False, action="store_true",
                      help="Compute the sum of the grouped values")
    parser.add_option("--hist", dest="hist", default=None, type=int, metavar="N",
                      help="Get the histogram of the grouped values with the given number of bins N")
    parser.add_option("--rand", dest="rand", default=False, action="store_true",
                      help="randomize the lines of the grouped values")
    parser.add_option("--prob", dest="prob", default=None, type=int,
                      metavar="X[,Y,Z]",
                      help="The probabilities associated with each entry in the data. If not given the sample assumes a uniform distribution over all entries in the data.")
    parser.add_option("--size", dest="size", default=None, type=int,
                      metavar="X",
                      help="The number of random lines to return. The default is 1.")
    # metavar must be a string: the integer 0 is falsy and optparse would
    # silently fall back to the upper-cased dest name.
    parser.add_option("--exclude", dest="exclude", default=None, type=int,
                      metavar="0",
                      help="Exclude the given column (zero-based numbering) from reading.")
    parser.add_option("-t", "--field-separator", dest="delimiter",
                      default=None, type=str, metavar=":",
                      help="The string used to separate values. By default, any consecutive whitespaces act as delimiter.")
    parser.add_option("-T", "--transpose", dest="transpose", default=False,
                      action="store_true", help="Transpose input")
    parser.add_option("--header", dest="header", default=False,
                      action="store_true", help="First row is considered as header.")
    (options, args) = parser.parse_args()

    # Order of the result columns in the final table
    operations = ['mean', 'std', 'se', 'min', 'max', 'median', 'argmin', 'argmax',
                  'sum', 'hist', 'rand']

    # Everything is read as strings; numeric conversion happens later so that
    # the grouping column may hold non-numeric labels.
    data = numpy.genfromtxt(sys.stdin, invalid_raise=False, dtype=str,
                            delimiter=options.delimiter)
    if options.exclude is not None:
        data = numpy.delete(data, [options.exclude], axis=1)
    if options.header:
        data = data[1:]  # drop the header row

    # Slicing: prepend a column of slice ids and group on it (column 0).
    if options.slice is not None:
        slice_size = options.slice
        slice_number = int(numpy.ceil(data.shape[0] / float(slice_size)))
    if options.nslice is not None:
        # --nslice takes precedence over --slice when both are given
        slice_number = options.nslice
        slice_size = int(numpy.ceil(data.shape[0] / float(slice_number)))
    if options.nslice is not None or options.slice is not None:
        slicer = numpy.repeat(range(slice_number), slice_size)[:data.shape[0]]
        data = numpy.c_[slicer, data]
        options.group = 0

    # Moving average: print rolling mean and rolling std per column and exit.
    if options.moving_average is not None:
        data = data.astype(float)
        ws = options.moving_average  # window size
        ma_array = []  # rolling mean and std, two entries per input column
        for colid in range(data.shape[1]):
            rolling = pandas.Series(data[:, colid]).rolling(window=ws, center=True)
            ma_array.append(rolling.mean())
            ma_array.append(rolling.std())
        ma_array = numpy.asarray(ma_array).T
        # Trim the rows for which the centred window is incomplete:
        # ws // 2 at the start and (ws - 1) // 2 at the end. Explicit floor
        # division keeps this independent of the Python version and avoids
        # discarding the last complete window.
        start = ws // 2
        stop = ma_array.shape[0] - (ws - 1) // 2
        data = ma_array[start:stop]
        prettyprint_array(data)
        sys.exit()

    # Transpose: write the transposed input, tab separated, and exit.
    if options.transpose:
        for row in data.T:
            for e in row:
                sys.stdout.write('%s\t' % e)
            sys.stdout.write('\n')
        sys.exit(0)

    col_labels = numpy.arange(data.shape[1])  # Keep column ids
    if options.group is None:
        # No group. Apply the operation to whole dataset
        keys = numpy.asarray([0, ])
        labels = numpy.zeros(data.shape[0])
        col_selection = numpy.ones(data.shape[1], dtype=bool)
    else:
        labels = data[:, options.group]
        try:
            labels = labels.astype(float)
        except ValueError:
            # Non-numeric group labels: deliberately keep them as strings
            pass
        # numpy.unique always returns a sorted 1-D array, even when there is
        # a single distinct label, so the key column of the output table
        # always has the rank prettyprint expects.
        keys = numpy.unique(labels)
        col_selection = (numpy.arange(data.shape[1]) != options.group)

    results = {k: [] for k in operations}
    data = data[:, col_selection].astype(float)
    col_labels = col_labels[col_selection]  # Keep track of the column labels

    for k in keys:
        row_selection = (labels == k)
        group = data[row_selection, :]  # rows belonging to the current key
        if options.mean:
            results['mean'].append(group.mean(axis=0))
        if options.std:
            results['std'].append(group.std(axis=0))
        if options.se:
            results['se'].append(group.std(axis=0) / numpy.sqrt(row_selection.sum()))
        if options.min:
            results['min'].append(group.min(axis=0))
        if options.max:
            results['max'].append(group.max(axis=0))
        if options.median:
            results['median'].append(numpy.median(group, axis=0))
        if options.argmin:
            results['argmin'].append(group.argmin(axis=0))
        if options.argmax:
            results['argmax'].append(group.argmax(axis=0))
        if options.sum:
            results['sum'].append(group.sum(axis=0))
        if options.hist is not None:
            # Histograms are printed directly, one per column. They are not
            # stored in `results`: a histogram is a pair (N densities,
            # N + 1 bin edges) that cannot be stacked into the rectangular
            # result table.
            for row in group.T:
                hist = numpy.histogram(row, bins=options.hist, density=True)
                print(' '.join(['%.4g %.4g\n' % _ for _ in zip(hist[1], hist[0])]))
        if options.rand:
            n = group.shape[0]
            if options.prob is None:
                p = None
            else:
                # Column whose original id is --prob, as a 1-D vector.
                # ravel() (unlike squeeze()) stays 1-D for a one-row group.
                p = group[:, col_labels == options.prob].ravel()
                p = p / p.sum()
            randsample = group[numpy.random.choice(n, p=p, size=options.size, replace=False)]
            if options.size is None:
                # A single row was drawn: restore the row axis
                randsample = randsample[None, :]
            for randsample_ in randsample:
                results['rand'].append(randsample_)

    results = {k: numpy.asarray(v) for k, v in results.items()}
    # Keys are repeated --size times only when random rows were drawn;
    # every other operation yields exactly one row per key.
    # NOTE(review): combining --rand --size N with other operations still
    # gives result columns with different row counts.
    prettyprint(keys, results, operations, options.size if options.rand else None)