Support for a list of known checksums of files already copied over.

For the times when we want to make shorter names of files by copying the documentation files for hyperlink usage, allow a new command-line option that takes a file listing entries in the form PATH_TO_FILE : sha256sum, so that those already-copied files can be used rather than making new copies.
author: Bradley M. Kuhn <bkuhn@ebb.org> 2013-01-09 14:51:33 -0500
committer: Bradley M. Kuhn <bkuhn@ebb.org> 2013-02-18 14:08:45 -0500
commit: d13ab6a4026cfeec18fdd989862aecbe83caa20f (patch)
tree: 7fc9ad4ebeacd9fa6c28496973aa7e1f3d001dc6 /contrib/non-profit-audit-reports
parent: 18d2867a6315562b4f4588ebf4fc58adf1fb9acf (diff)
download: fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.tar.gz
fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.tar.bz2
fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.zip
1 files changed, 70 insertions, 9 deletions
diff --git a/contrib/non-profit-audit-reports/csv2ods.py b/contrib/non-profit-audit-reports/csv2ods.py
index 3a3411ba..7dd840c8 100755
--- a/contrib/non-profit-audit-reports/csv2ods.py
+++ b/contrib/non-profit-audit-reports/csv2ods.py
@@ -25,14 +25,46 @@ import csv
 import ooolib2
 import shutil
 import string
+from Crypto.Hash import SHA256
 
 def err(msg):
     print 'error: %s' % msg
     sys.exit(1)
 
-def csv2ods(csvname, odsname, encoding='', singleFileDirectory=None, verbose = False):
+def ReadChecksums(inputFile):
+    checksums = {}
+    with open(inputFile, "r") as inputFH:
+        entries = inputFH.readlines()
+    for ee in entries:
+        fileName, checksum = ee.split(":")
+        fileName = fileName.replace(' ', "")
+        checksum = checksum.replace(' ', "")
+        checksum = checksum.replace("\n", "")
+        checksums[checksum] = fileName
+    return checksums
+
+def ChecksumFile(filename):
+    sha256 = SHA256.new()
+    chunk_size = 8192
+    with open(filename, 'rb') as myFile:
+        while True:
+            chunk = myFile.read(chunk_size)
+            if len(chunk) == 0:
+                break
+            sha256.update(chunk)
+    return sha256.hexdigest()
+
+def main():
+    program = os.path.basename(sys.argv[0])
+
+    print get_file_checksum(sys.argv[1])
+
+def csv2ods(csvname, odsname, encoding='', singleFileDirectory=None, knownChecksums={}, verbose = False):
     filesSavedinManifest = {}
 
+    if knownChecksums:
+        checksumCache = {}
+
     if verbose:
         print 'converting from %s to %s' % (csvname, odsname)
 
@@ -70,10 +102,25 @@ def csv2ods(csvname, odsname, encoding='', singleFileDirectory=None, verbose = F
                     if (len(val) > 0 and val[0:5] == "link:"):
                         val = val[5:]
                         linkname = os.path.basename(val) # name is just the last component
+                        newFile = None
+
                         if not singleFileDirectory:
                             newFile = val
-                        else:
+
+                        if knownChecksums:
+                            if not checksumCache.has_key(val):
+                                checksum = ChecksumFile(val)
+                                checksumCache[val] = checksum
+                            else:
+                                checksum = checksumCache[val]
+
+                            if knownChecksums.has_key(checksum):
+                                newFile = knownChecksums[checksum]
+                                print "FOUND new file in known: " + newFile
+
+                        if not newFile:
                             relativeFileWithPath = os.path.basename(val)
+
                             fileName, fileExtension = os.path.splitext(relativeFileWithPath)
                             newFile = fileName[:15]   # 15 is an arbitrary choice.
                             newFile = newFile + fileExtension
@@ -88,19 +135,24 @@ def csv2ods(csvname, odsname, encoding='', singleFileDirectory=None, verbose = F
                                     if not filesSavedinManifest.has_key(testFile):
                                         break
                                     testFile = None
-                                if not testFile:
-                                    raise Exception("too many similar file names for linkage; giving up")
-                                else:
-                                    newFile = testFile
-                            if not os.path.exists(csvdir + '/' + val):
-                                raise Exception("File" + csvdir + '/' + val + " does not exist in single file directory mode; giving up")
+                                    if not testFile:
+                                        raise Exception("too many similar file names for linkage; giving up")
+                                    else:
+                                        newFile = testFile
+                                        if not os.path.exists(csvdir + '/' + val):
+                                            raise Exception("File" + csvdir + '/' + val + " does not exist in single file directory mode; giving up")
                             src = os.path.join(csvdir, val)
                             dest = os.path.join(csvdir, singleFileDirectory, newFile)
                             shutil.copyfile(src, dest)
                             shutil.copystat(src, dest)
                             shutil.copymode(src, dest)
+
                             newFile = os.path.join(singleFileDirectory, newFile)
 
+                        if knownChecksums:
+                            checksumCache[checksum]   = newFile
+                            knownChecksums[checksum]  = newFile
+
                         linkrel = '../' + newFile # ../ means remove the name of the *.ods
                         doc.set_cell_value(col + 1, row, 'link', (linkrel, linkname))
                         linkpath = csvdir + '/' + val
@@ -149,6 +201,8 @@ def main():
                       help='unicode character encoding type')
     parser.add_option('-d', '--single-file-directory', action='store',
                       help='directory name to move all files into')
+    parser.add_option('-s', '--known-checksum-list', action='store',
+                      help='directory name to move all files into')
     (options, args) = parser.parse_args()
 
     if len(args) != 0:
@@ -163,7 +217,14 @@ def main():
         print 'csv:', options.csv
         print 'ods:', options.ods
         print 'ods:', options.encoding
-    csv2ods(options.csv, options.ods, options.encoding, options.single_file_directory, options.verbose)
+    if options.known_checksum_list and not options.single_file_directory:
+        err(program + ": --known-checksum-list option is completely useless without --single-file-directory")
+    knownChecksums = {}
+    if options.known_checksum_list:
+        if not os.access(options.known_checksum_list, os.R_OK):
+            err(program + ": unable to read file: " + options.known_checksum_list)
+        knownChecksums = ReadChecksums(options.known_checksum_list)
+    csv2ods(options.csv, options.ods, options.encoding, options.single_file_directory, knownChecksums, options.verbose)
 
 if __name__ == '__main__':
   main()
author	Bradley M. Kuhn <bkuhn@ebb.org>	2013-01-09 14:51:33 -0500
committer	Bradley M. Kuhn <bkuhn@ebb.org>	2013-02-18 14:08:45 -0500
commit	d13ab6a4026cfeec18fdd989862aecbe83caa20f (patch)
tree	7fc9ad4ebeacd9fa6c28496973aa7e1f3d001dc6 /contrib/non-profit-audit-reports
parent	18d2867a6315562b4f4588ebf4fc58adf1fb9acf (diff)
download	fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.tar.gz fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.tar.bz2 fork-ledger-d13ab6a4026cfeec18fdd989862aecbe83caa20f.zip