A simple Python clone of GNU datamash.

#!/usr/bin/env python2
# -*- coding: UTF8 -*-
# Author: Guillaume Bouvier -- guillaume.bouvier@pasteur.fr
# https://research.pasteur.fr/en/member/guillaume-bouvier/
# 2017-09-19 10:27:58 (UTC+0200)
import sys
from optparse import OptionParser
import numpy
import pandas
from prettytable import PrettyTable
def prettyprint(keys, results, operations, size=None):
    """
    Print the grouped results as a borderless table with a header row.

    keys: 1D array of group keys; one output row per key, or size rows
          per key when size is given.
    results: dict mapping operation name -> 2D array of per-group results
             (empty arrays are skipped).
    operations: ordered list of operation names; determines column order.
    size: when not None, each key is repeated size times (used by the
          --rand option, which can emit several rows per key).
    """
    if size is None:
        data = keys[:, None]
    else:
        # One output row per random sample: repeat each key 'size' times.
        data = numpy.repeat(keys, size)[:, None]
    fields = ["key", ]
    for k in operations:
        if results[k].size > 0:
            # 'data' always starts from the key column above, so results
            # are only ever appended to it (the original 'data is None'
            # fallback was unreachable and has been removed).
            data = numpy.c_[data, results[k]]
            n = results[k].shape[1]
            # Column headers like mean0, mean1, ...: one per value column.
            fields.extend(["%s%s" % e for e in zip([k, ] * n, range(n))])
    x = PrettyTable(fields)
    for row in data:
        x.add_row(list(row))
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print(x.get_string(border=False, header=True))
def prettyprint_array(A):
    """
    Pretty print of an array (A), without border or header.

    NOTE(review): A.dtype.names is None for a plain (non-structured)
    ndarray, which is what the --moving_average code path passes in;
    confirm that the installed prettytable version accepts None as the
    field-name list (the header is hidden anyway).
    """
    x = PrettyTable(A.dtype.names, header=False, border=False)
    for row in A:
        x.add_row(row)
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print(x)
if __name__ == '__main__':
    # Command-line driver: reads delimited data from stdin, optionally
    # groups the rows (by a column, by fixed-size slices, or by a number
    # of slices), and applies the requested reductions per group.
    parser = OptionParser()
    # NOTE(review): the metavar advertises "X[,Y,Z]" but type=int only
    # accepts a single zero-based column index -- multi-column grouping
    # is not actually supported by this parser definition.
    parser.add_option("-g", "--group", dest="group", default=None, type=int,
                      metavar="X[,Y,Z]",
                      help="group via fields X,[Y,Z]")
    parser.add_option("--slice", dest="slice", default=None, type=int,
                      metavar="X",
                      help="Create slices of given size instead of using group option")
    parser.add_option("--nslice", dest="nslice", default=None, type=int,
                      metavar="N",
                      help="Create N slices instead of using group option")
    parser.add_option("--moving_average", dest="moving_average", default=None, type=int,
                      metavar="X",
                      help="Create a moving_average using the given window size. Return an array with the rolling mean and the rolling std for each column")
    parser.add_option("--mean", dest="mean", default=False, action="store_true",
                      help="Compute the mean on the grouped values")
    parser.add_option("--std", dest="std", default=False, action="store_true",
                      help="Compute the standard deviation of the grouped values")
    parser.add_option("--se", dest="se", default=False, action="store_true",
                      help="Compute the standard error of the grouped values")
    parser.add_option("--min", dest="min", default=False, action="store_true",
                      help="Get the minimum of the grouped values")
    parser.add_option("--max", dest="max", default=False, action="store_true",
                      help="Get the maximum of the grouped values")
    parser.add_option("--median", dest="median", default=False, action="store_true",
                      help="Get the median of the grouped values")
    parser.add_option("--argmin", dest="argmin", default=False, action="store_true",
                      help="Get the argmin of the grouped values")
    parser.add_option("--argmax", dest="argmax", default=False, action="store_true",
                      help="Get the argmax of the grouped values")
    parser.add_option("--sum", dest="sum", default=False, action="store_true",
                      help="Compute the sum of the grouped values")
    parser.add_option("--hist", dest="hist", default=None, type=int, metavar="N",
                      help="Get the histogram of the grouped values with the given number of bins N")
    parser.add_option("--rand", dest="rand", default=False, action="store_true",
                      help="randomize the lines of the grouped values")
    parser.add_option("--prob", dest="prob", default=None, type=int,
                      metavar="X[,Y,Z]",
                      help="The probabilities associated with each entry in the data. If not given the sample assumes a uniform distribution over all entries in the data.")
    parser.add_option("--size", dest="size", default=None, type=int,
                      metavar="X",
                      help="The number of random lines to return. The default is 1.")
    # NOTE(review): metavar=0 is falsy, so optparse falls back to the
    # uppercased dest ("EXCLUDE") in the help text; probably meant "0".
    parser.add_option("--exclude", dest="exclude", default=None, type=int,
                      metavar=0,
                      help="Exclude the given column (zero-based numbering) from reading.")
    parser.add_option("-t", "--field-separator", dest="delimiter",
                      default=None, type=str, metavar=":",
                      help="The string used to separate values. By default, any consecutive whitespaces act as delimiter.")
    parser.add_option("-T", "--transpose", dest="transpose", default=False,
                      action="store_true", help="Transpose input")
    parser.add_option("--header", dest="header", default=False,
                      action="store_true", help="First row is considered as header.")
    (options, args) = parser.parse_args()
    # Names of all reductions; also the keys of the 'results' dict below.
    operations = ['mean', 'std', 'se', 'min', 'max', 'median', 'argmin', 'argmax',
                  'sum', 'hist', 'rand']
    # Read stdin as strings; malformed lines are skipped (invalid_raise=False).
    data = numpy.genfromtxt(sys.stdin, invalid_raise=False, dtype=str,
                            delimiter=options.delimiter)
    if options.exclude is not None:
        # Drop the excluded column before any further processing.
        data = numpy.delete(data, [options.exclude], axis=1)
    if options.header:
        # Discard the header row.
        data = data[1:]
    if options.slice is not None:
        # Fixed slice size -> derive the number of slices.
        slice_size = options.slice
        slice_number = int(numpy.ceil(data.shape[0]/float(slice_size)))
    if options.nslice is not None:
        # Fixed number of slices -> derive the slice size.
        slice_number = options.nslice
        slice_size = int(numpy.ceil(data.shape[0]/float(slice_number)))
    if options.nslice is not None or options.slice is not None:
        # Prepend a synthetic slice-id column and group on it (column 0).
        slicer = numpy.repeat(range(slice_number), slice_size)[:data.shape[0]]
        data = numpy.c_[slicer, data]
        options.group = 0
    if options.moving_average is not None:
        data = numpy.float_(data)
        ws = options.moving_average  # window size
        ma_array = []  # array to store moving average results
        for colid in range(data.shape[1]):
            # Centered rolling mean and std for every column; the output
            # interleaves columns as (mean0, std0, mean1, std1, ...).
            ts = pandas.Series(data[:, colid])
            ma_array.append(ts.rolling(window=ws,center=True).mean())
            ma_array.append(ts.rolling(window=ws,center=True).std())
        ma_array = numpy.asarray(ma_array).T
        # Trim the NaN edges produced by the centered window.
        # NOTE(review): ws/2 relies on Python 2 integer division; under
        # Python 3 this would be a float and the slice would raise.
        data = ma_array[ws/2:-ws/2]
        prettyprint_array(data)
        sys.exit()
    if options.transpose:
        # Print the transposed input as tab-separated values and exit.
        for row in data.T:
            for e in row:
                sys.stdout.write('%s\t'%e)
            sys.stdout.write('\n')
        sys.exit(0)
    col_labels = numpy.arange(data.shape[1])  # Keep column ids
    if options.group is None:
        # No group. Apply the operation to whole dataset
        keys = numpy.asarray([0, ])
        labels = numpy.zeros(data.shape[0])
        col_selection = numpy.ones(data.shape[1], dtype=bool)
    else:
        # One group per distinct value of the grouping column.
        keys = numpy.unique(data[:, options.group])
        labels = data[:, options.group]
        try:
            # Compare/sort numerically when keys parse as floats,
            # lexically otherwise.
            keys = numpy.float_(keys)
            labels = numpy.float_(labels)
        except ValueError:
            pass
        if keys.size > 1:
            keys.sort()
        else:
            keys = numpy.asarray([keys])
        # Every column except the grouping column carries the values.
        col_selection = (numpy.arange(data.shape[1]) != options.group)
    results = {k: [] for k in operations}
    data = numpy.float_(data[:, col_selection])
    col_labels = col_labels[col_selection]  # Keep track of the column labels
    for k in keys:
        # Boolean mask selecting the rows of the current group.
        row_selection = (labels == k)
        if options.mean:
            results['mean'].append(data[row_selection, :].mean(axis=0))
        if options.std:
            results['std'].append(data[row_selection, :].std(axis=0))
        if options.se:
            # Standard error = std / sqrt(group size).
            results['se'].append(data[row_selection, :].std(axis=0)/numpy.sqrt(row_selection.sum()))
        if options.min:
            results['min'].append(data[row_selection, :].min(axis=0))
        if options.max:
            results['max'].append(data[row_selection, :].max(axis=0))
        if options.median:
            results['median'].append(numpy.median(data[row_selection, :], axis=0))
        if options.argmin:
            # Index is relative to the group's rows, not the whole input.
            results['argmin'].append(data[row_selection, :].argmin(axis=0))
        if options.argmax:
            results['argmax'].append(data[row_selection, :].argmax(axis=0))
        if options.sum:
            results['sum'].append(data[row_selection, :].sum(axis=0))
        if options.hist is not None:
            # Histograms are printed immediately, one per value column.
            for row in data[row_selection, :].T:
                hist = numpy.histogram(row, bins=options.hist, density=True)
                print ' '.join(['%.4g %.4g\n'%_ for _ in zip(hist[1], hist[0])])
                results['hist'].append(hist)
        if options.rand:
            n = data[row_selection, :].shape[0]
            if options.prob is None:
                p = None
            else:
                # Sampling weights come from the column whose ORIGINAL id
                # equals --prob (col_labels tracks pre-selection ids),
                # normalized to sum to 1.
                p = data[row_selection, :][:, col_labels==options.prob]
                p = numpy.squeeze(p)
                p /= p.sum()
            randsample = data[row_selection, :][numpy.random.choice(n, p=p, size=options.size, replace=False)]
            if options.size is None:
                # A single sample comes back as a 1D row; make it 2D so the
                # loop below appends one row.
                randsample = randsample[None, :]
            for randsample_ in randsample:
                results['rand'].append(randsample_)
    results = {k: numpy.asarray(v) for k,v in results.items()}
    prettyprint(keys, results, operations, options.size)
view raw pydatamash.py hosted with ❤ by GitHub
If you want to ask me a question or leave me a message, add @bougui505 in your comment.