gitweb.fluxo.info Git - csv-hasher.git/commitdiff
Fix: improvements and tests for large files
author Silvio Rhatto <rhatto@riseup.net>
Fri, 29 Jan 2021 00:41:53 +0000 (21:41 -0300)
committer Silvio Rhatto <rhatto@riseup.net>
Fri, 29 Jan 2021 00:41:53 +0000 (21:41 -0300)
Makefile
Pipfile
Pipfile.lock
bin/make-sample
csv-hasher.py
csv-sampler.py [new file with mode: 0755]

index 16311a150c91d435a6ad50b4b9ae971267acd63c..1a13c569019c55b321272bd9d15e78941fc43068 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,20 +2,37 @@
 # Makefile for csv-hasher
 #
 
+CHUNKSIZE                 = 10000
+CHECK_LINES               = 20
+SAMPLE_ITERATIONS         = 1000
+SAMPLE_ROWS_PER_ITERATION = 1000
+TESTS                     = tests
+COLNAME                   = id
+SAMPLE                    = $(TESTS)/sample.csv
+OUTPUT                    = $(TESTS)/output.csv
+
 vendor:
        pipenv install
 
 sample:
-       bin/make-sample 200
+       @#bin/make-sample $(ITERATIONS)
+       pipenv run ./csv-sampler.py --iterations $(SAMPLE_ITERATIONS) --rows_per_iteration $(SAMPLE_ROWS_PER_ITERATION) $(SAMPLE)
 
 test-sample:
-       pipenv run ./csv-hasher.py --chunksize 5 tests/sample.csv tests/output.csv id
+       pipenv run ./csv-hasher.py --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)
 
 show-test-output:
-       head -20 tests/sample.csv
-       head -20 tests/output.csv
+       head -$(CHECK_LINES) $(SAMPLE)
+       head -$(CHECK_LINES) $(OUTPUT)
+       tail -$(CHECK_LINES) $(SAMPLE)
+       tail -$(CHECK_LINES) $(OUTPUT)
+       wc -l  $(SAMPLE)
+       wc -l  $(OUTPUT)
+       ls -lh $(TESTS)
 
 clean-sample:
-       rm tests/*.csv
+       rm -f tests/*.csv
+
+clean: clean-sample
 
-test: clean-sample sample test-sample show-test-output clean-sample
+test: clean-sample sample test-sample show-test-output
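With these targets in place, a plain "make test" cleans any stale sample, regenerates $(SAMPLE) with csv-sampler.py, hashes it into $(OUTPUT) in chunks, and prints head/tail excerpts plus line counts of both files as a quick sanity check.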
diff --git a/Pipfile b/Pipfile
index 30be324506384f84eab09a74458aa67ceb38417a..756589a4f6bf17f38e57fd4231ec5d193a9fd4b7 100644 (file)
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,8 @@ name = "pypi"
 pandas = "*"
 tqdm = "*"
 humanfriendly = "*"
+numpy = "*"
+Faker = "*"
 
 [dev-packages]
 
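Faker is added alongside numpy here, but the new csv-sampler.py below only draws random integers with numpy; nothing in this commit exercises Faker yet. A minimal, hypothetical sketch of how fake rows could be generated with it (column names and values are illustrative, not from this repo):

    # Hypothetical: building fake sample rows with Faker.
    # Not part of this commit; csv-sampler.py uses numpy instead.
    import pandas as pd
    from faker import Faker

    fake = Faker()
    rows = 10
    df = pd.DataFrame({
        'id':   [fake.uuid4() for _ in range(rows)],
        'name': [fake.name()  for _ in range(rows)],
        'city': [fake.city()  for _ in range(rows)],
    })
    df.to_csv('sample.csv', index=False)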
index cd7b1b2be1741f35124f0e65e2702a7cd06f9e8c..56b7accef73163ab19ab9a8c99a7dda12ce48a44 100644 (file)
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
+            "sha256": "48063038d08edcb167714b008be15beec84ab053a15d4504cea5d4c7ca6ed321"
         },
         "pipfile-spec": 6,
         "requires": {
         ]
     },
     "default": {
+        "faker": {
+            "hashes": [
+                "sha256:0783729c61501d52efea2967aff6e6fcb8370f0f6b5a558f2a81233642ae529a",
+                "sha256:6b2995ffff6c2b02bc5daad96f8c24c021e5bd491d9d53d31bcbd66f348181d4"
+            ],
+            "index": "pypi",
+            "version": "==5.8.0"
+        },
         "humanfriendly": {
             "hashes": [
                 "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
@@ -61,6 +69,7 @@
                 "sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827",
                 "sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"
             ],
+            "index": "pypi",
             "version": "==1.19.5"
         },
         "pandas": {
             ],
             "version": "==1.15.0"
         },
+        "text-unidecode": {
+            "hashes": [
+                "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+                "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+            ],
+            "version": "==1.3"
+        },
         "tqdm": {
             "hashes": [
                 "sha256:4621f6823bab46a9cc33d48105753ccbea671b68bab2c50a9f0be23d4065cb5a",
index c282a30aa780b55352819a2f9d9a47e0730bdb61..7d405b59b9f0bbdaf2db4e8dd398e9e7ac03e468 100755 (executable)
--- a/bin/make-sample
+++ b/bin/make-sample
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Build a sample dataset.
+# Build a sample dataset, shell script version.
 #
 # Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
 #
index fe206f81e493a5451159c4890920b4aaaed5f7e2..6415c7b55d17a1df3ff43405e751e56cd84b437f 100755 (executable)
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
             print('Invalid hash function ' + self.args.hashfunc)
             exit (1)
 
-    def apply_hash(self, df):
+    def apply_hash(self, df, skip=0):
         """Apply the hash function into a column from a dataframe"""
 
-        return df[self.args.colname[0]].apply(lambda x: \
+        return df[self.args.colname[0]][skip:].apply(lambda x: \
                 getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
 
     def run_legacy(self):
         """
-        Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
-        This won't work with CSVs larger than the available memory in the system.
+        Process CSV in "legacy" mode: open the input file, process and write
+        the output in a single step.
+
+        This won't work with CSVs larger than the available memory in the
+        system.
 
         Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
         Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
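The new skip parameter slices the column before hashing, so callers can leave out rows that were already processed. A minimal standalone sketch of the same operation, assuming the sha256 default and a column named id:

    # Sketch of what apply_hash computes, outside the class.
    import hashlib
    import pandas as pd

    df = pd.DataFrame({'id': [1, 2, 3]})
    skip = 0  # rows to leave out from the start of the chunk
    hashed = df['id'][skip:].apply(
            lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
    print(hashed)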
@@ -76,26 +79,44 @@ class CsvHasher:
         Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
         """
 
-        infile = self.args.infile[0]
+        # Shorthands
+        infile  = self.args.infile[0]
+        outfile = self.args.outfile[0]
 
         # Get number of lines in the CSV file
         nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
         nlines = int(nlines.split()[0])
 
+        # Check the input file
         if nlines < 2:
             print('CSV file is too small.')
             exit (1)
 
-        # Read the just to get the column names
-        sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
-        sample    = pd.concat(sample_tp, ignore_index=True)
+        # Start with an empty file
+        try:
+            with open(outfile, 'w') as f:
+                f.truncate(0)
+        except IOError:
+            print('Error writing to ' + outfile)
+            exit(1)
+
+        # Holds columns definition
+        columns = None
+
+        # Read a chunk just to get the column names
+        with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+            for chunk in sample:
+                columns = chunk.columns
+                break
 
         # Initialize progress bar
         progress_bar = tqdm(total=nlines) if self.args.progress else False
 
+        # Controls if the header should be included
         write_header = True
 
-        for i in range(0, nlines, self.args.chunksize):
+        # Start iteration from 1 so the CSV header is skipped
+        for i in range(1, nlines, self.args.chunksize):
             df = pd.read_csv(infile,
                     sep=self.args.sep,
                     header=None,               # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
                     skiprows=i)                # skip rows that were already read
 
             # Add column information
-            df.columns = sample.columns
+            df.columns = columns
 
             # Hashing the column
             try:
@@ -113,7 +134,7 @@ class CsvHasher:
                 exit (1)
 
             # Writing the new CSV output
-            df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+            df.to_csv(outfile, index=False, mode='a', header=write_header)
 
             # Write the header only in the first iteration
             write_header = False
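Taken together, the chunked path counts lines, reads one chunk only to learn the column names, truncates the output, then walks the file in chunksize steps starting at row 1 so the header is skipped, hashing and appending as it goes. A condensed sketch of that flow (file names, column name and chunk size are illustrative):

    # Condensed sketch of the chunked read-hash-append flow above.
    import hashlib
    import pandas as pd

    infile, outfile, colname, chunksize = 'in.csv', 'out.csv', 'id', 10000

    nlines  = sum(1 for _ in open(infile))          # stand-in for wc -l
    columns = pd.read_csv(infile, nrows=0).columns  # header only

    open(outfile, 'w').close()                      # start with an empty file

    write_header = True
    for i in range(1, nlines, chunksize):           # starting at 1 skips the header
        df = pd.read_csv(infile, header=None, nrows=chunksize, skiprows=i)
        df.columns = columns
        df[colname] = df[colname].apply(
                lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
        df.to_csv(outfile, index=False, mode='a', header=write_header)
        write_header = False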
@@ -148,7 +169,8 @@ def cmdline():
 
     parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
 
-    parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+    parser.add_argument('--chunksize', dest='chunksize',
+            help='Read chunks at a time, defaults to 1M, supports human-readable notation')
 
     parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')
 
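The --chunksize help promises human-readable notation ('1M' and the like), and humanfriendly is already a dependency; presumably something like parse_size does the conversion. The parsing code is not shown in this diff, so treat this as an assumption:

    # Assumed sketch: converting human-readable notation to an integer chunk size.
    from humanfriendly import parse_size

    chunksize = int(parse_size('1M'))  # 1000000
    print(chunksize)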
diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755 (executable)
index 0000000..35d82db
--- /dev/null
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy  as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+    def __init__(self, args):
+        self.args                    = args
+        self.args.iterations         = int(self.args.iterations)
+        self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+    @staticmethod
+    def random_col(size, low=1):
+        # Random integers in the half-open range [low, size), one per row
+        return np.random.randint(low, size, size=size)
+
+    # Inspired by
+    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+    def write_csv(self, write_header=True, mode='w'):
+        df       = pd.DataFrame(columns=['id', 'a', 'b', 'c', 'd'])
+        df['id'] = self.random_col(self.args.rows_per_iteration)
+        df['a']  = self.random_col(self.args.rows_per_iteration)
+        df['b']  = self.random_col(self.args.rows_per_iteration)
+        df['c']  = self.random_col(self.args.rows_per_iteration)
+        df['d']  = self.random_col(self.args.rows_per_iteration)
+
+        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+    def run(self):
+        progress_bar = tqdm(total=self.args.iterations) if self.args.progress else False
+
+        # Write the first portion
+        self.write_csv()
+
+        # Dispatch
+        for i in range(0, self.args.iterations):
+            self.write_csv(write_header=False, mode='a')
+
+            if hasattr(progress_bar, 'update'):
+                progress_bar.update(1)
+
+def cmdline():
+    """
+    Evaluate the command line.
+
+    :return: Command line arguments.
+    """
+
+    basename = os.path.basename(__file__)
+
+    # Parse CLI
+    #examples  = "Examples:\n\t" + basename + " --no-progress \n"
+
+    epilog = ''
+    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
+                                     epilog=epilog,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+    parser.add_argument('outfile',  nargs=1, help='CSV output file name')
+
+    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+            type=int, help='Rows per iteration, defaults to 1000')
+
+    parser.add_argument('--iterations', dest='iterations',
+            help='Number of iterations, defaults to 1000')
+
+    parser.add_argument('--progress', dest='progress', action='store_true',
+                        help='Enable progress bar.')
+
+    parser.add_argument('--no-progress', dest='progress', action='store_false',
+                        help='Disable progress bar.')
+
+    # Add default values and get args
+    parser.set_defaults(rows_per_iteration=1000)
+    parser.set_defaults(iterations=1000)
+    parser.set_defaults(progress=True)
+    args = parser.parse_args()
+
+    return args
+
+if __name__ == "__main__":
+    args     = cmdline()
+    instance = CsvSampler(args)
+
+    instance.run()
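For reference, the whole sampler boils down to appending blocks of random integer columns and writing the header only on the first block. A compact sketch of the same idea (sizes mirror the defaults; the tests/ directory must already exist):

    # Compact sketch of csv-sampler.py's generation loop.
    import numpy as np
    import pandas as pd

    rows, iterations, outfile = 1000, 1000, 'tests/sample.csv'

    def random_col(size, low=1):
        # Random integers in the half-open range [low, size), one per row
        return np.random.randint(low, size, size=size)

    for i in range(iterations):
        df = pd.DataFrame({c: random_col(rows) for c in ['id', 'a', 'b', 'c', 'd']})
        df.to_csv(outfile, index=False, header=(i == 0),
                  mode='w' if i == 0 else 'a')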