pipenv run ./csv-sampler.py --iterations $(SAMPLE_ITERATIONS) --rows_per_iteration $(SAMPLE_ROWS_PER_ITERATION) $(SAMPLE)
# Run csv-hasher.py (through pipenv) with $(SAMPLE), $(OUTPUT) and $(COLNAME) as
# its positional arguments; --check makes the script compare both files after the run.
test-sample:
- pipenv run ./csv-hasher.py --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)
+ pipenv run ./csv-hasher.py --check --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)
show-test-output:
head -$(CHECK_LINES) $(SAMPLE)
if hasattr(progress_bar, 'close'):
progress_bar.close()
+ def check(self):
+ """Check both files for differences"""
+
+ df_infile = pd.read_csv(self.args.infile[0], sep=self.args.sep)
+ df_outfile = pd.read_csv(self.args.outfile[0], sep=self.args.sep)
+
+ print('Comparing both files without excluding the ' + self.args.colname[0] + ' column:')
+ print(df_infile.compare(df_outfile))
+
+ del df_infile[self.args.colname[0]]
+ del df_outfile[self.args.colname[0]]
+
+ print('Comparing both files excluding the ' + self.args.colname[0] + ' column:')
+ print(df_infile.compare(df_outfile))
+
def cmdline():
"""
Evaluate the command line.
parser.add_argument('--no-progress', dest='progress', action='store_false',
help='Disable progress bar.')
+ parser.add_argument('--check', dest='check', action='store_true',
+ help='Check both files for differences (test suite), defaults to false.')
+
# Add default values and get args
parser.set_defaults(sep=',')
parser.set_defaults(chunksize='1M')
parser.set_defaults(hashfunc='sha256')
parser.set_defaults(progress=True)
+ parser.set_defaults(check=False)
args = parser.parse_args()
return args
instance = CsvHasher(args)
instance.run()
+
+ if args.check == True:
+ instance.check()