#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
  Copyright 2016 Grégory Soutadé

  This file is part of Photorec Stage 2.

  Photorec Stage 2 is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Photorec Stage 2 is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Photorec Stage 2.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
from zipfile import ZipFile
import argparse
import shutil
from datetime import datetime

try:
    from zipfile import BadZipFile
except ImportError:  # Python 2 only provides the old spelling
    from zipfile import BadZipfile as BadZipFile

# Markers surrounding the creation date in OpenDocument 'meta.xml' and in
# OOXML 'docProps/core.xml'.
# NOTE(review): these four values were empty strings in the mangled source
# (angle-bracket content looks stripped); they are restored here from the
# ODF / OOXML element names -- confirm against the upstream project.
OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

# Content of the 'mimetype' zip member => extension to give to the file
OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.presentation': 'odp',
}

# Top level zip directory => extension of the MS Office document
_MS_OFFICE_PREFIXES = (
    ('word/', 'docx'),
    ('xl/', 'xlsx'),
    ('ppt/', 'pptx'),
)

# Only the first 19 characters ('YYYY-MM-DDTHH:MM:SS') of a date are parsed
_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
_DATE_LENGTH = 19

# Number of copied files, per extension (filled by copy_file())
statistics = {}


def remove_extension(filename):
    """Split filename on its last '.'.

    Returns a (name, extension) tuple, extension is '' when filename
    contains no '.'.
    """
    name, dot, extension = filename.rpartition('.')
    if not dot:
        return (filename, '')
    return name, extension


def _join_name(name, extension):
    """Build 'name.extension', or just 'name' when extension is empty."""
    if not extension:
        return name
    return '%s.%s' % (name, extension)


def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    """Copy orig_filepath into outdir (created if needed) without overwriting.

    If extension is not given, it is taken from filename. When the target
    already exists, '_2', '_3'... is appended to the name until a free one
    is found. The per extension counter in 'statistics' is incremented.
    """
    if not os.path.exists(outdir):
        if verbose:
            print('mkdirs %s' % (outdir))
        os.makedirs(outdir)

    if not extension:
        filename, extension = remove_extension(filename)

    target = os.path.join(outdir, _join_name(filename, extension))
    cur_num = 2
    while os.path.exists(target):
        numbered = '%s_%d' % (filename, cur_num)
        target = os.path.join(outdir, _join_name(numbered, extension))
        cur_num = cur_num + 1

    if verbose:
        print('\tCopy %s => %s' % (orig_filepath, target))

    statistics[extension] = statistics.get(extension, 0) + 1
    shutil.copy(orig_filepath, target)


def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    """Copy orig_filepath to outdir/YYYY/MM/YYYY_MM_DD.extension."""
    outdir = os.path.join(outdir, str(dt.year), '%02d' % dt.month)
    filename = '%d_%02d_%02d' % (dt.year, dt.month, dt.day)
    return copy_file(orig_filepath, filename, outdir, verbose, extension)


def _extract_creation_date(metadata, start_tag, end_tag):
    """Return the datetime found between start_tag and end_tag, else None.

    metadata is the raw content read from the zip member (bytes or text).
    """
    if isinstance(metadata, bytes):
        metadata = metadata.decode('utf-8', 'replace')
    try:
        start = metadata.index(start_tag) + len(start_tag)
        end = metadata.index(end_tag, start)
        return datetime.strptime(metadata[start:end][:_DATE_LENGTH],
                                 _DATE_FORMAT)
    except ValueError:
        # Marker not found or unparsable date
        return None


def _copy_office_file(orig_filepath, filename, outdir, verbose, extension,
                      creation_date):
    """Copy an office document, named after its creation date if known."""
    outdir = os.path.join(outdir, 'office')
    if creation_date is not None:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose,
                           extension)
    else:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir,
                  verbose, extension)


def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir,
                     verbose):
    """Handle the archive as an OpenDocument file.

    zipname is the name of the 'mimetype' member. Returns True if the file
    has been copied, False if the mime type is unknown or if 'meta.xml'
    is missing.
    """
    with zipfile.open(zipname) as mimetype:
        mime = mimetype.read()
    if isinstance(mime, bytes):
        mime = mime.decode('utf-8', 'replace')
    mime = mime.strip()

    if mime not in OPEN_OFFICE_MIME_TYPES:
        return False
    ext = OPEN_OFFICE_MIME_TYPES[mime]
    if verbose:
        print('Found %s file' % (ext))

    try:
        meta = zipfile.open('meta.xml')
    except KeyError:
        return False
    with meta:
        metadata = meta.read()

    creation_date = _extract_creation_date(metadata,
                                           OOO_CREATION_DATE_START,
                                           OOO_CREATION_DATE_END)
    _copy_office_file(orig_filepath, filename, outdir, verbose, ext,
                      creation_date)
    return True


def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose,
                   extension):
    """Handle the archive as an MS Office (OOXML) file.

    Returns True if the file has been copied, False if 'docProps/core.xml'
    is missing.
    """
    try:
        meta = zipfile.open('docProps/core.xml')
    except KeyError:
        return False
    with meta:
        metadata = meta.read()

    creation_date = _extract_creation_date(metadata,
                                           MS_CREATION_DATE_START,
                                           MS_CREATION_DATE_END)
    _copy_office_file(orig_filepath, filename, outdir, verbose, extension,
                      creation_date)
    return True


def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    """Detect office documents recovered as '.zip' and copy them.

    Returns True if the file has been recognized and copied, False
    otherwise (including unreadable archives) so that the caller falls
    back to a simple copy.
    """
    try:
        with ZipFile(orig_filepath, 'r') as myzip:
            # A missing 'docProps/core.xml' is missing for every member
            ms_office_failed = False
            for name in myzip.namelist():
                # Maybe an Open Document file
                if name == 'mimetype':
                    if _try_open_office(orig_filepath, myzip, name, filename,
                                        outdir, verbose):
                        return True
                    continue
                if ms_office_failed:
                    continue
                for prefix, ms_extension in _MS_OFFICE_PREFIXES:
                    if not name.startswith(prefix):
                        continue
                    if _try_ms_office(orig_filepath, myzip, filename, outdir,
                                      verbose, ms_extension):
                        return True
                    ms_office_failed = True
                    break
    except BadZipFile:
        # Recovered archives are often truncated or corrupted
        return False
    return False


def _escape_name(name):
    """Remove path separators from name."""
    name = name.replace('/', '')
    name = name.replace('\\', '')
    return name


def _append_tag(filename, tag):
    """Append tag to filename, separated by ' - ' if filename is not empty.

    Empty tags are ignored. Non integer tags are escaped.
    """
    if not tag:
        return filename
    if filename:
        filename = '%s - ' % filename
    if isinstance(tag, int):
        return '%s%d' % (filename, tag)
    return '%s%s' % (filename, _escape_name(tag))


def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy an audio file to multimedia/audio/<artist>/<album>/<track - title>.

    Relies on the eyed3 package. Returns False (nothing copied) if tags
    cannot be read or if artist, album and title are all missing.
    """
    try:
        tags = eyed3.load(orig_filepath).tag
    except Exception:
        # Third party parser (or package not loaded): give up on any error
        return False

    if not tags or (not tags.artist and not tags.album and not tags.title):
        return False

    outdir = os.path.join(outdir, 'multimedia', 'audio')

    new_filename = ''
    if tags.track_num[0]:
        new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    if not new_filename:
        # No track number nor title: keep the recovered name
        new_filename = remove_extension(filename)[0]

    subdir = False
    if tags.artist:
        outdir = os.path.join(outdir, _escape_name(tags.artist))
        subdir = True
    if tags.album:
        outdir = os.path.join(outdir, _escape_name(tags.album))
        subdir = True
    if not subdir:
        outdir = os.path.join(outdir, 'unknown')

    copy_file(orig_filepath, new_filename, outdir, verbose, extension)
    return True


def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy a picture to multimedia/pictures/YYYY/MM/YYYY_MM_DD.extension.

    Relies on the exifread package. Returns False (nothing copied) if EXIF
    data cannot be read or has no valid 'EXIF DateTimeOriginal'.
    """
    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except Exception:
            # Third party parser (or package not loaded): give up
            return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')

    try:
        original_date = tags['EXIF DateTimeOriginal']
    except KeyError:
        # No 'EXIF DateTimeOriginal'
        return False

    try:
        dt = datetime.strptime(original_date.values, '%Y:%m:%d %H:%M:%S')
    except (ValueError, TypeError, AttributeError):
        print('Invalid date format \'%s\'' % original_date)
        return False

    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True


_multiplier_bytes = (1, 1024, 1024*1024, 1024*1024*1024)
_multiplier_bits = (1, 1000, 1000*1000, 1000*1000*1000)

# Unit letter => index in the multiplier tables
_multiplier_indexes = {'k': 1, 'm': 2, 'g': 3}


def parse_min_sizes(min_sizes_comma):
    """Parse the --min-size parameter ('mp3:1M,txt:4k,doc:8').

    Returns a dictionary extension => minimum size. Sizes accept an
    optional k/m/g unit (any case), optionally followed by 'i' and/or
    'b'/'B'. Raises ValueError on malformed input.
    """
    if not min_sizes_comma:
        return {}

    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.strip().split(':')
        if len(sub_elements) != 2:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        extension = sub_elements[0].strip()
        size = sub_elements[1].strip()

        multiplier_tab = _multiplier_bytes
        if size[-1:] in ('b', 'B'):
            size = size[:-1]
        # NOTE(review): the 'i' suffix selects the 1000 based table while
        # the default is 1024 based; kept as found -- confirm the intent.
        if size[-1:] == 'i':
            size = size[:-1]
            multiplier_tab = _multiplier_bits

        multiplier_idx = _multiplier_indexes.get(size[-1:].lower(), 0)
        if multiplier_idx:
            size = size[:-1]

        try:
            size = int(size)
        except ValueError:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))

        min_sizes[extension] = size * multiplier_tab[multiplier_idx]

    return min_sizes


# Extension => function(orig_filepath, filename, extension, outdir, verbose)
# returning True when the file has been handled
file_ops = {
    'zip': manage_zip,
}


def _register_optional_handlers():
    """Register mp3/jpg handlers if their third party packages are there."""
    global eyed3, exifread

    try:
        import eyed3
        file_ops['mp3'] = manage_audio
    except ImportError:
        print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3')

    try:
        import exifread
        file_ops['jpg'] = manage_picture
    except ImportError:
        print('Package exifread not installed, jpg format not supported. Use pip install exifread')


def _build_parser():
    """Build the command line parser."""
    parser = argparse.ArgumentParser(description='Photorec post script analysis: try to recover filename with metadata (supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')
    parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
    parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
    parser.add_argument('--max-files-per-temp', dest='max_files_per_temp', help='Maximum unknown files in temprorary directory, -1 for no temp dir', type=int, default=50)
    parser.add_argument('--skip-ext', dest='skip_ext', help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
    parser.add_argument('--only-ext', dest='only_ext', help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
    parser.add_argument('--min-size', dest='min_size', help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
    parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
    parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')
    return parser


def main():
    """Walk the input directory and copy every accepted file."""
    parser = _build_parser()
    args = parser.parse_args()

    _register_optional_handlers()

    skip_exts = args.skip_ext.split(',') if args.skip_ext else None
    only_exts = args.only_ext.split(',') if args.only_ext else None
    try:
        min_sizes = parse_min_sizes(args.min_size)
    except ValueError as e:
        parser.error(str(e))

    # Disable (force) verbose on quiet
    if args.quiet:
        args.verbose = False

    cur_out_dir = 0
    cur_files_in_out_dir = 0
    outdir = args.out_dir

    for root, dirs, files in os.walk(args.in_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            _, cur_extension = remove_extension(filename)

            # Only some extensions
            if only_exts and cur_extension not in only_exts:
                if args.verbose:
                    print('Skipping %s (only extension)' % (full_path))
                continue

            # Skipping some extensions
            if skip_exts and cur_extension in skip_exts:
                if args.verbose:
                    print('Skipping %s (skip extension)' % (full_path))
                continue

            # Min sizes
            if cur_extension in min_sizes:
                statinfo = os.stat(full_path)
                if statinfo.st_size < min_sizes[cur_extension]:
                    if args.verbose:
                        print('Skipping %s (min size)' % (full_path))
                    continue

            # Filtered files
            handler = file_ops.get(cur_extension)
            if handler is not None:
                if args.verbose:
                    print('Filter \'%s\'' % (full_path))
                if handler(full_path, filename, cur_extension, args.out_dir,
                           args.verbose):
                    continue

            # Simple copy
            if args.max_files_per_temp != -1:
                if cur_files_in_out_dir == args.max_files_per_temp:
                    cur_files_in_out_dir = 0
                    cur_out_dir = cur_out_dir + 1
                outdir = os.path.join(args.out_dir, str(cur_out_dir))
                cur_files_in_out_dir = cur_files_in_out_dir + 1

            if args.verbose:
                print('Std copy %s => %s' % (full_path, outdir))

            # Already traced just above
            copy_file(full_path, filename, outdir, verbose=False)

    if not args.quiet:
        print('Statistics :\n')
        for key in sorted(statistics):
            print('\t.%s\t=> %d' % (key, statistics[key]))


if __name__ == '__main__':
    main()