{
"_meta": {
"hash": {
- "sha256": "19ab6829f09294559ac6466b24082f8537cb5c7be2d6aec8bbe7b18814d3d587"
+ "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
},
"pipfile-spec": 6,
"requires": {
]
},
"default": {
+ "humanfriendly": {
+ "hashes": [
+ "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
+ "sha256:d5c731705114b9ad673754f3317d9fa4c23212f36b29bdc4272a892eafc9bc72"
+ ],
+ "index": "pypi",
+ "version": "==9.1"
+ },
"numpy": {
"hashes": [
"sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94",
import pandas as pd
import hashlib
import subprocess
+import humanfriendly
from sys import exit
from tqdm import tqdm
def __init__(self, args):
# Save arguments
- self.args = args
+ self.args = args
+ self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))
# Check if source file exists
if not os.path.exists(args.infile[0]):
exit (1)
def apply_hash(self, df):
    """Return the hex digests of every value in the configured column.

    Each value of ``df[self.args.colname[0]]`` is converted with ``str()``,
    encoded as UTF-8 and hashed with the ``hashlib`` constructor named by
    ``self.args.hashfunc``.

    Args:
        df: Table to read the column from. Presumably a pandas DataFrame
            chunk (the column selection must support ``.apply``) -- confirm
            against the caller.

    Returns:
        The result of ``.apply`` on the selected column: one hexadecimal
        digest string per input value.

    Raises:
        AttributeError: If ``self.args.hashfunc`` does not name an attribute
            of ``hashlib``. The name is resolved before any row is
            processed, so a bad name is reported even for an empty column.
    """
    column = self.args.colname[0]

    # Resolve the hash constructor once instead of once per row.
    make_digest = getattr(hashlib, self.args.hashfunc)

    def hex_of(value):
        # str() first so that non-string cells (numbers, NaN, ...) can be encoded.
        return make_digest(str(value).encode('utf-8')).hexdigest()

    return df[column].apply(hex_of)
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', type=int, help='Read chunks at a time, defaults to 1000')
+ parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256')
# Add default values and get args
parser.set_defaults(sep=',')
- parser.set_defaults(chunksize=1000)
+ parser.set_defaults(chunksize='1M')
parser.set_defaults(hashfunc='sha256')
parser.set_defaults(progress=True)
args = parser.parse_args()