Initial commit
This commit is contained in: commit 26c784e59f

photorec_stage_2.py (new file, 366 lines)
@@ -0,0 +1,366 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright 2016 Grégory Soutadé

This file is part of Photorec Stage 2.

Photorec Stage 2 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Photorec Stage 2 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Photorec Stage 2. If not, see <http://www.gnu.org/licenses/>.
"""

import os
from zipfile import ZipFile
import argparse
import shutil
from datetime import datetime

OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text' : 'odt',
    'application/vnd.oasis.opendocument.spreadsheet' : 'ods',
    'application/vnd.oasis.opendocument.presentation' : 'odp',
}

statistics = {}

def remove_extension(filename):
    point_pos = filename.rfind('.')

    if point_pos == -1:
        return (filename, '')

    extension = filename[point_pos+1:]
    filename = filename[:point_pos]

    return filename, extension

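# copy_file() copies orig_filepath into outdir as '<filename>.<extension>',
# creating the output directory on demand and appending _2, _3, ... when the
# target name is already taken. Per-extension counts are accumulated in the
# global statistics dict.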
def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    if not os.path.exists(outdir):
        if verbose:
            print('mkdirs %s' % (outdir))
        os.makedirs(outdir)

    if not extension:
        filename, extension = remove_extension(filename)

    if os.path.exists(os.path.join(outdir, '%s.%s' % (filename, extension))):
        cur_num = 2
        while os.path.exists(os.path.join(outdir, '%s_%d.%s' % (filename, cur_num, extension))):
            cur_num = cur_num + 1
        filename = os.path.join(outdir, '%s_%d.%s' % (filename, cur_num, extension))
    else:
        filename = os.path.join(outdir, '%s.%s' % (filename, extension))

    if verbose:
        print('\tCopy %s => %s' % (orig_filepath, filename))

    statistics[extension] = statistics.get(extension, 0) + 1

    shutil.copy(orig_filepath, filename)

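# copy_datetime_file() files a document under <outdir>/<year>/<month>/ with a
# YYYY_MM_DD base name derived from the given datetime.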
def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    outdir = os.path.join(outdir, str(dt.year), '%02d' % dt.month)
    filename = '%d_%02d_%02d' % (dt.year, dt.month, dt.day)
    return copy_file(orig_filepath, filename, outdir, verbose, extension)

def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir, verbose):
    with zipfile.open(zipname) as mimetype:
        mime = mimetype.read()
        if not mime in OPEN_OFFICE_MIME_TYPES.keys(): return False

        ext = OPEN_OFFICE_MIME_TYPES[mime]

        if verbose:
            print('Found %s file' % (ext))

        try:
            meta = zipfile.open('meta.xml')
        except KeyError:
            return False

        outdir = os.path.join(outdir, 'office')

        metadata = meta.read()
        try:
            start = metadata.index(OOO_CREATION_DATE_START)
            end = metadata.index(OOO_CREATION_DATE_END)

            creation_date = datetime.strptime(metadata[start+len(OOO_CREATION_DATE_START):end], '%Y-%m-%dT%H:%M:%S')
            copy_datetime_file(creation_date, orig_filepath, outdir, verbose, ext)
        except:
            copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, ext)

        meta.close()
        return True
    return False

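# OOXML documents keep their creation date in docProps/core.xml
# (<dcterms:created>); when that date cannot be found or parsed, the file is
# copied without a date-based name instead of being skipped.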
def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose, extension):
    try:
        meta = zipfile.open('docProps/core.xml')
    except KeyError:
        return False

    outdir = os.path.join(outdir, 'office')

    metadata = meta.read()
    try:
        start = metadata.index(MS_CREATION_DATE_START)
        end = metadata.index(MS_CREATION_DATE_END)

        creation_date = datetime.strptime(metadata[start+len(MS_CREATION_DATE_START):end][:19], '%Y-%m-%dT%H:%M:%S')
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, extension)
    except:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, extension)

    meta.close()

    return True

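# manage_zip() looks inside a recovered .zip to identify office documents:
# a top-level 'mimetype' member suggests an OpenDocument file, while members
# under 'word/', 'xl/' or 'ppt/' suggest an OOXML document, spreadsheet or
# presentation respectively.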
def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    ret = False
    with ZipFile(orig_filepath, 'r') as myzip:
        for name in myzip.namelist():
            # Maybe an Open Document file
            if name == 'mimetype':
                ret = _try_open_office(orig_filepath, myzip, name, filename, outdir, verbose)
                if ret: return ret
            if name.startswith('word/'):
                ret = _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, 'docx')
                if ret: return ret
            if name.startswith('xl/'):
                ret = _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, 'xlsx')
                if ret: return ret
            if name.startswith('ppt/'):
                ret = _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, 'pptx')
                if ret: return ret

    return ret

def _escape_name(name):
    name = name.replace('/', '')
    name = name.replace('\\', '')

    return name

def _append_tag(filename, tag):
    if not tag: return filename

    if filename:
        filename = '%s - ' % filename

    if type(tag) == int:
        filename = '%s%d' % (filename, tag)
    else:
        filename = '%s%s' % (filename, _escape_name(tag))

    return filename

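# manage_audio() reads ID3 tags with eyed3 and rebuilds a '<track> - <title>.<ext>'
# style name under multimedia/audio/<artist>/<album>/ (or multimedia/audio/unknown
# when neither artist nor album is tagged).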
def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    tags = None
    try:
        tags = eyed3.load(orig_filepath).tag
    except:
        return False

    if not tags or (not tags.artist and not tags.album and not tags.title):
        return False

    outdir = os.path.join(outdir, 'multimedia', 'audio')

    new_filename = ''
    if tags.track_num[0]: new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    new_filename = '%s.%s' % (new_filename, extension)

    subdir = False
    if tags.artist:
        outdir = os.path.join(outdir, _escape_name(tags.artist))
        subdir = True
    if tags.album:
        outdir = os.path.join(outdir, _escape_name(tags.album))
        subdir = True

    if not subdir:
        outdir = os.path.join(outdir, 'unknown')

    copy_file(orig_filepath, new_filename, outdir, verbose)

    return True

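# manage_picture() extracts the 'EXIF DateTimeOriginal' tag with exifread and
# files the picture under multimedia/pictures/<year>/<month>/.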
def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    tags = None

    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except:
            return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')

    dt = None
    try:
        dt = datetime.strptime(tags['EXIF DateTimeOriginal'].values, '%Y:%m:%d %H:%M:%S')
    except KeyError: # No 'EXIF DateTimeOriginal'
        return False
    except:
        print('Invalid date format \'%s\'' % tags['EXIF DateTimeOriginal'])
        return False

    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True

_multiplier_bytes = (1, 1024, 1024*1024, 1024*1024*1024)
_multiplier_bits = (1, 1000, 1000*1000, 1000*1000*1000)

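# parse_min_sizes() turns a string such as 'mp3:1M,txt:4k' into a dict of byte
# thresholds, e.g. {'mp3': 1048576, 'txt': 4096}. Plain k/M/G suffixes use the
# 1024-based table; an 'i' suffix (e.g. '4kiB') switches to the 1000-based one.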
def parse_min_sizes(min_sizes_comma):
    if not min_sizes_comma: return {}
    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.split(':')
        if len(sub_elements) != 2:
            raise Exception('Invalid parameter --min-size \'%s\'' % (element))
        extension = sub_elements[0]
        size = sub_elements[1]
        multiplier_idx = 0
        multiplier_tab = _multiplier_bytes
        if size[-1:] == 'b' or size[-1:] == 'B':
            size = size[:-1]
        if size[-1:] == 'i':
            size = size[:-1]
            multiplier_tab = _multiplier_bits
        if size[-1:] == 'k' or size[-1:] == 'K':
            multiplier_idx = 1
            size = size[:-1]
        elif size[-1:] == 'm' or size[-1:] == 'M':
            multiplier_idx = 2
            size = size[:-1]
        elif size[-1:] == 'g' or size[-1:] == 'G':
            multiplier_idx = 3
            size = size[:-1]

        try:
            size = int(size)
        except:
            raise Exception('Invalid parameter --min-size \'%s\'' % (element))

        min_sizes[extension] = size * multiplier_tab[multiplier_idx]

    return min_sizes

parser = argparse.ArgumentParser(description='Photorec post script analysis: try to recover filename with metadata (supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')

parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
parser.add_argument('--max-files-per-temp', dest='max_files_per_temp',
                    help='Maximum unknown files in temporary directory, -1 for no temp dir',
                    type=int, default=50)
parser.add_argument('--skip-ext', dest='skip_ext', help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--only-ext', dest='only_ext', help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--min-size', dest='min_size', help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')

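# Example invocation (hypothetical paths, shown for illustration only):
#   python photorec_stage_2.py --in /data/recup_dir --out /data/sorted \
#       --skip-ext txt --min-size jpg:100k --verbose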
args = parser.parse_args()

file_ops = {
    'zip': manage_zip,
}

try:
    import eyed3
    file_ops['mp3'] = manage_audio
except:
    print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3')

try:
    import exifread
    file_ops['jpg'] = manage_picture
except:
    print('Package exifread not installed, jpg format not supported. Use pip install exifread')

file_ops_keys = file_ops.keys()

cur_out_dir = 0
cur_files_in_out_dir = 0
skip_exts = args.skip_ext and args.skip_ext.split(',') or None
only_exts = args.only_ext and args.only_ext.split(',') or None

min_sizes = parse_min_sizes(args.min_size)
min_sizes_keys = min_sizes.keys()

# Disable (force) verbose on quiet
if args.quiet: args.verbose = False

if args.max_files_per_temp == -1:
    outdir = args.out_dir

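# Main pass over the photorec output tree: filter files by extension and
# minimum size, hand known formats (zip/mp3/jpg) to their handler, and fall
# back to a plain copy into numbered batch directories of at most
# --max-files-per-temp files each.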
for root, dirs, files in os.walk(args.in_dir):
    for filename in files:

        full_path = os.path.join(root, filename)
        _, cur_extension = remove_extension(filename)

        # Only some extensions
        if only_exts and cur_extension not in only_exts:
            if args.verbose:
                print('Skipping %s (only extension)' % (full_path))
            continue

        # Skipping some extensions
        if skip_exts and cur_extension in skip_exts:
            if args.verbose:
                print('Skipping %s (skip extension)' % (full_path))
            continue

        # Min sizes
        if min_sizes and cur_extension in min_sizes_keys:
            statinfo = os.stat(full_path)
            if statinfo.st_size < min_sizes[cur_extension]:
                if args.verbose:
                    print('Skipping %s (min size)' % (full_path))
                continue

        # Filtered files
        if cur_extension in file_ops_keys:
            if args.verbose:
                print('Filter \'%s\'' % (full_path))
            ret = file_ops[cur_extension](full_path, filename, cur_extension,
                                          args.out_dir, args.verbose)

            if ret: continue

        # Simple copy
        if args.max_files_per_temp != -1:
            if cur_files_in_out_dir == args.max_files_per_temp:
                cur_files_in_out_dir = 0
                cur_out_dir = cur_out_dir + 1
            outdir = os.path.join(args.out_dir, str(cur_out_dir))
            cur_files_in_out_dir = cur_files_in_out_dir + 1

        if args.verbose:
            print('Std copy %s => %s' % (full_path, outdir))

        copy_file(full_path, filename, outdir, verbose=False)

if not args.quiet:
    print('Statistics :\n')
    for key in sorted(statistics.keys()):
        print('\t.%s\t=> %d' % (key, statistics[key]))