#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Provenance: created as photorec_stage_2.py by commit
# 26c784e59fff8641141c08a47e193473d71b5e23 ("Initial commit"),
# Grégory Soutadé, Mon, 12 Sep 2016 19:37:14 +0200.

"""
  Copyright 2016 Grégory Soutadé

  This file is part of Photorec Stage 2.

  Photorec Stage 2 is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Photorec Stage 2 is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Photorec Stage 2.  If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
import os
import shutil
from datetime import datetime
from zipfile import BadZipfile, ZipFile

# Markers surrounding the creation timestamp in an OpenDocument ``meta.xml``.
# They must never be empty: an empty marker matches at offset 0 and no date
# can be extracted.
OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

# Markers surrounding the creation timestamp in an OOXML ``docProps/core.xml``.
MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

# Content of the ``mimetype`` archive member => extension of the document.
OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.presentation': 'odp',
}

# Archive directory prefix => extension of the Microsoft Office document.
_MS_OFFICE_PREFIXES = (
    ('word/', 'docx'),
    ('xl/', 'xlsx'),
    ('ppt/', 'pptx'),
)

# Only the first 19 characters ('YYYY-MM-DDTHH:MM:SS') of a timestamp are
# parsed, so fractional seconds or a trailing 'Z' do not break the parsing.
_CREATION_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
_CREATION_DATE_LENGTH = 19

# Extension => number of files copied with that extension (see copy_file()).
statistics = {}


def remove_extension(filename):
    """Split *filename* at its last dot.

    Returns a ``(name, extension)`` tuple; the extension does not contain
    the dot and is ``''`` when *filename* has no dot at all.
    """
    point_pos = filename.rfind('.')

    if point_pos == -1:
        return (filename, '')

    return filename[:point_pos], filename[point_pos + 1:]


def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    """Copy *orig_filepath* into *outdir* without overwriting anything.

    *outdir* is created when missing. When *extension* is not given it is
    taken from *filename*. If the target name already exists, ``_2``,
    ``_3``... is appended to the base name until a free name is found.
    The per-extension counter in ``statistics`` is incremented.
    """
    if not os.path.exists(outdir):
        if verbose:
            print('mkdirs %s' % (outdir))
        os.makedirs(outdir)

    if not extension:
        filename, extension = remove_extension(filename)

    # No trailing dot for files that have no extension at all
    suffix = '.%s' % extension if extension else ''

    destination = os.path.join(outdir, '%s%s' % (filename, suffix))
    cur_num = 2
    while os.path.exists(destination):
        destination = os.path.join(outdir,
                                   '%s_%d%s' % (filename, cur_num, suffix))
        cur_num += 1

    if verbose:
        print('\tCopy %s => %s' % (orig_filepath, destination))

    statistics[extension] = statistics.get(extension, 0) + 1

    shutil.copy(orig_filepath, destination)


def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    """Copy a file to ``outdir/YYYY/MM/YYYY_MM_DD.extension`` using *dt*."""
    outdir = os.path.join(outdir, str(dt.year), '%02d' % dt.month)
    filename = '%d_%02d_%02d' % (dt.year, dt.month, dt.day)
    return copy_file(orig_filepath, filename, outdir, verbose, extension)


def _read_zip_text(zipfile, name):
    """Return the archive member *name* decoded as text.

    Raises KeyError (from ``ZipFile.open``) when the member does not exist.
    Decoding keeps comparisons with the ``str`` markers working on Python 3,
    where archive members are read as bytes.
    """
    with zipfile.open(name) as member:
        return member.read().decode('utf-8', 'replace')


def _extract_creation_date(metadata, start_marker, end_marker):
    """Return the datetime found between the two markers, or None."""
    start = metadata.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)

    end = metadata.find(end_marker, start)
    if end == -1:
        return None

    raw_date = metadata[start:end].strip()[:_CREATION_DATE_LENGTH]
    try:
        return datetime.strptime(raw_date, _CREATION_DATE_FORMAT)
    except ValueError:
        return None


def _copy_office_file(creation_date, orig_filepath, filename, outdir,
                      verbose, extension):
    """Copy an office document, named by date when known, else by *filename*."""
    outdir = os.path.join(outdir, 'office')

    if creation_date is not None:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose,
                           extension)
    else:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir,
                  verbose, extension)


def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir, verbose):
    """Try to handle the archive as an OpenDocument file.

    *zipfile* is the opened archive and *zipname* its ``mimetype`` member.
    Returns True when the file has been copied, False when the mime type is
    not a supported one or when the archive has no ``meta.xml``.
    """
    mime = _read_zip_text(zipfile, zipname).strip()
    ext = OPEN_OFFICE_MIME_TYPES.get(mime)
    if ext is None:
        return False

    if verbose:
        print('Found %s file' % (ext))

    try:
        metadata = _read_zip_text(zipfile, 'meta.xml')
    except KeyError:
        return False

    creation_date = _extract_creation_date(metadata,
                                           OOO_CREATION_DATE_START,
                                           OOO_CREATION_DATE_END)
    _copy_office_file(creation_date, orig_filepath, filename, outdir,
                      verbose, ext)
    return True


def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose, extension):
    """Try to handle the archive as a Microsoft Office (OOXML) file.

    Returns True when the file has been copied, False when the archive has
    no ``docProps/core.xml``.
    """
    try:
        metadata = _read_zip_text(zipfile, 'docProps/core.xml')
    except KeyError:
        return False

    creation_date = _extract_creation_date(metadata,
                                           MS_CREATION_DATE_START,
                                           MS_CREATION_DATE_END)
    _copy_office_file(creation_date, orig_filepath, filename, outdir,
                      verbose, extension)
    return True


def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    """Recognise office documents stored as zip archives and copy them.

    Returns True when the file has been recognised and copied into
    ``outdir/office``, False otherwise (the caller then does a plain copy).
    *extension* is unused; it is part of the common handler signature.
    """
    tried = set()
    try:
        with ZipFile(orig_filepath, 'r') as myzip:
            for name in myzip.namelist():
                # Maybe an Open Document file
                if name == 'mimetype':
                    if _try_open_office(orig_filepath, myzip, name, filename,
                                        outdir, verbose):
                        return True
                    continue

                for prefix, ms_extension in _MS_OFFICE_PREFIXES:
                    if not name.startswith(prefix):
                        continue
                    # The outcome only depends on the archive, not on the
                    # entry: probe each document type once.
                    if ms_extension not in tried:
                        tried.add(ms_extension)
                        if _try_ms_office(orig_filepath, myzip, filename,
                                          outdir, verbose, ms_extension):
                            return True
                    break
    except BadZipfile:
        # Truncated or corrupt archive: nothing has been copied yet (the
        # metadata is read before any copy), let the caller do a plain copy.
        return False

    return False
def _escape_name(name):
    """Make a metadata value usable as a single path component.

    Path separators are removed. The values '.' and '..' are replaced by
    underscores because, used as a directory name, they would redirect the
    copy to another directory.
    """
    name = name.replace('/', '')
    name = name.replace('\\', '')

    if name in ('.', '..'):
        name = '_' * len(name)

    return name


def _append_tag(filename, tag):
    """Append *tag* to *filename*, separated by ' - ' when both are set.

    An empty/None *tag* leaves *filename* unchanged. Integer tags (track
    numbers) are formatted as numbers, other tags are escaped first.
    """
    if not tag:
        return filename

    if filename:
        filename = '%s - ' % filename

    if isinstance(tag, int):
        return '%s%d' % (filename, tag)

    return '%s%s' % (filename, _escape_name(tag))


def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy an audio file to ``multimedia/audio/<artist>/<album>/``.

    The new name is '<track> - <title>' built from the tags read with eyed3.
    Returns False (nothing copied) when tags cannot be read or when artist,
    album and title are all missing; True once the file has been copied.
    """
    try:
        tags = eyed3.load(orig_filepath).tag
    except Exception:
        # Recovered files are frequently damaged: any failure of the tag
        # reader (including load() giving back nothing) means "not handled".
        return False

    if not tags or (not tags.artist and not tags.album and not tags.title):
        return False

    outdir = os.path.join(outdir, 'multimedia', 'audio')

    new_filename = ''
    if tags.track_num[0]:
        new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    if not new_filename:
        # Neither track number nor title: keep the recovered name rather
        # than creating a hidden '.<extension>' file.
        new_filename = remove_extension(filename)[0]

    subdir = False
    if tags.artist:
        outdir = os.path.join(outdir, _escape_name(tags.artist))
        subdir = True
    if tags.album:
        outdir = os.path.join(outdir, _escape_name(tags.album))
        subdir = True

    if not subdir:
        outdir = os.path.join(outdir, 'unknown')

    copy_file(orig_filepath, new_filename, outdir, verbose, extension)

    return True


def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy a picture to ``multimedia/pictures/YYYY/MM/YYYY_MM_DD.ext``.

    The date comes from the 'EXIF DateTimeOriginal' tag read with exifread.
    Returns False (nothing copied) when the tags cannot be read, when the
    tag is missing or when its value is not a valid date; True otherwise.
    *filename* is unused; it is part of the common handler signature.
    """
    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except Exception:
            # Damaged file: let the caller do a plain copy
            return False

    try:
        dt = datetime.strptime(tags['EXIF DateTimeOriginal'].values,
                               '%Y:%m:%d %H:%M:%S')
    except KeyError:  # No 'EXIF DateTimeOriginal'
        return False
    except (AttributeError, TypeError, ValueError):
        print('Invalid date format \'%s\'' % tags['EXIF DateTimeOriginal'])
        return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')
    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True


_multiplier_bytes = (1, 1024, 1024*1024, 1024*1024*1024)
_multiplier_bits = (1, 1000, 1000*1000, 1000*1000*1000)

# Unit letter => index in the multiplier tables above
_multiplier_indexes = {'k': 1, 'm': 2, 'g': 3}


def parse_min_sizes(min_sizes_comma):
    """Parse the --min-size value into an ``{extension: bytes}`` dict.

    *min_sizes_comma* is a comma separated list of ``extension:size``
    elements (eg: ``mp3:1M,txt:4k,doc:8``). A size is an integer optionally
    followed by a unit k, m or g (any case) and an optional b/B.

    Raises ValueError when an element is malformed.
    """
    if not min_sizes_comma:
        return {}

    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.split(':')
        if len(sub_elements) != 2:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        extension, size = sub_elements

        multiplier_tab = _multiplier_bytes
        if size[-1:] in ('b', 'B'):
            size = size[:-1]
        if size[-1:] == 'i':
            size = size[:-1]
            # NOTE(review): the 'i' infix (as in 'KiB') selects the 1000
            # based table while plain units use the 1024 based one, which is
            # the reverse of the IEC convention. Kept as is; confirm the
            # intent before swapping.
            multiplier_tab = _multiplier_bits

        multiplier_idx = _multiplier_indexes.get(size[-1:].lower(), 0)
        if multiplier_idx:
            size = size[:-1]

        try:
            size = int(size)
        except ValueError:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))

        min_sizes[extension] = size * multiplier_tab[multiplier_idx]

    return min_sizes


# Optional third party packages: a handler is only registered in main() when
# its package is available.
try:
    import eyed3
except ImportError:
    eyed3 = None

try:
    import exifread
except ImportError:
    exifread = None


def _build_parser():
    """Build the command line parser of the script."""
    parser = argparse.ArgumentParser(description='Photorec post script analysis: try to recover filename with metadata (supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')

    parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
    parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
    parser.add_argument('--max-files-per-temp', dest='max_files_per_temp',
                        help='Maximum unknown files in temporary directory, -1 for no temp dir',
                        type=int, default=50)
    parser.add_argument('--skip-ext', dest='skip_ext', help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
    parser.add_argument('--only-ext', dest='only_ext', help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
    parser.add_argument('--min-size', dest='min_size', help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
    parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
    parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')

    return parser


def main(argv=None):
    """Sort the files found under --in into --out.

    Files whose extension has a handler (zip, and mp3/jpg when the optional
    packages are installed) are renamed from their metadata. Every other
    file, or a file its handler did not recognise, is copied unchanged into
    numbered directories of at most --max-files-per-temp files (or directly
    into --out when that option is -1).

    *argv* is the argument list to parse, ``sys.argv[1:]`` when None.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    file_ops = {
        'zip': manage_zip,
    }

    if eyed3 is not None:
        file_ops['mp3'] = manage_audio
    else:
        print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3')

    if exifread is not None:
        file_ops['jpg'] = manage_picture
    else:
        print('Package exifread not installed, jpg format not supported. Use pip install exifread')

    skip_exts = args.skip_ext.split(',') if args.skip_ext else None
    only_exts = args.only_ext.split(',') if args.only_ext else None

    try:
        min_sizes = parse_min_sizes(args.min_size)
    except ValueError as e:
        # Usage message instead of a traceback (exits the script)
        parser.error(str(e))

    # Disable (force) verbose on quiet
    if args.quiet:
        args.verbose = False

    cur_out_dir = 0
    cur_files_in_out_dir = 0
    if args.max_files_per_temp == -1:
        outdir = args.out_dir
    else:
        # Must be set before the first plain copy, not only after the
        # first rollover.
        outdir = os.path.join(args.out_dir, str(cur_out_dir))

    for root, dirs, files in os.walk(args.in_dir):
        for filename in files:

            full_path = os.path.join(root, filename)
            _, cur_extension = remove_extension(filename)

            # Only some extensions
            if only_exts and cur_extension not in only_exts:
                if args.verbose:
                    print('Skipping %s (only extension)' % (full_path))
                continue

            # Skipping some extensions
            if skip_exts and cur_extension in skip_exts:
                if args.verbose:
                    print('Skipping %s (skip extension)' % (full_path))
                continue

            # Min sizes
            if cur_extension in min_sizes:
                statinfo = os.stat(full_path)
                if statinfo.st_size < min_sizes[cur_extension]:
                    if args.verbose:
                        print('Skipping %s (min size)' % (full_path))
                    continue

            # Filtered files
            handler = file_ops.get(cur_extension)
            if handler is not None:
                if args.verbose:
                    print('Filter \'%s\'' % (full_path))
                if handler(full_path, filename, cur_extension,
                           args.out_dir, args.verbose):
                    continue

            # Simple copy
            if args.max_files_per_temp != -1:
                if cur_files_in_out_dir == args.max_files_per_temp:
                    cur_files_in_out_dir = 0
                    cur_out_dir = cur_out_dir + 1
                    outdir = os.path.join(args.out_dir, str(cur_out_dir))
                cur_files_in_out_dir = cur_files_in_out_dir + 1

            if args.verbose:
                print('Std copy %s => %s' % (full_path, outdir))

            # The copy has just been announced: no second message
            copy_file(full_path, filename, outdir, verbose=False)

    if not args.quiet:
        print('Statistics :\n')
        for key in sorted(statistics):
            print('\t.%s\t=> %d' % (key, statistics[key]))


if __name__ == '__main__':
    main()