#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright 2016 Grégory Soutadé
This file is part of Photorec Stage 2.
Photorec Stage 2 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Photorec Stage 2 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Photorec Stage 2.  If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import os
import re
import shutil
from datetime import datetime
from zipfile import BadZipFile, ZipFile
# Markers surrounding the creation date in an OpenDocument `meta.xml`.
OOO_CREATION_DATE_START = '<meta:creation-date>'
OOO_CREATION_DATE_END = '</meta:creation-date>'

# Markers surrounding the creation date in an MS Office `docProps/core.xml`.
MS_CREATION_DATE_START = '<dcterms:created xsi:type="dcterms:W3CDTF">'
MS_CREATION_DATE_END = '</dcterms:created>'

# Content of the `mimetype` archive member -> extension given to the copy.
OPEN_OFFICE_MIME_TYPES = {
    'application/vnd.oasis.opendocument.text': 'odt',
    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
    'application/vnd.oasis.opendocument.presentation': 'odp',
}

# Number of files copied so far, keyed by extension (updated by copy_file).
statistics = {}
def remove_extension(filename):
    """Split *filename* at its last dot.

    Returns a ``(stem, extension)`` tuple. The extension carries no leading
    dot and is ``''`` when *filename* contains no dot at all.
    """
    stem, dot, extension = filename.rpartition('.')
    if not dot:
        # rpartition() puts the whole string in the last slot when the
        # separator is missing; the caller expects it as the stem instead.
        return filename, ''
    return stem, extension
def copy_file(orig_filepath, filename, outdir, verbose, extension=None):
    """Copy *orig_filepath* into *outdir* without overwriting an existing file.

    orig_filepath -- path of the file to copy
    filename      -- wanted name of the copy; when *extension* is empty the
                     extension is split off this name
    outdir        -- destination directory, created when missing
    verbose       -- print the directory creation and the copy
    extension     -- extension of the copy, without the leading dot

    When the wanted name is already taken, ``_2``, ``_3``... is appended to
    the stem until a free name is found. The per-extension counter in
    ``statistics`` is incremented.

    Returns the path of the copy.
    """
    if not os.path.isdir(outdir):
        if verbose:
            print(f'mkdirs {outdir}')
        # exist_ok: the directory may appear between the test and the call.
        os.makedirs(outdir, exist_ok=True)

    if not extension:
        filename, extension = remove_extension(filename)
    # No trailing dot for files that have no extension at all.
    suffix = f'.{extension}' if extension else ''

    destination = os.path.join(outdir, f'{filename}{suffix}')
    cur_num = 2
    while os.path.exists(destination):
        destination = os.path.join(outdir, f'{filename}_{cur_num}{suffix}')
        cur_num += 1

    if verbose:
        print(f'\tCopy {orig_filepath} => {destination}')
    statistics[extension] = statistics.get(extension, 0) + 1
    shutil.copy(orig_filepath, destination)
    return destination
def copy_datetime_file(dt, orig_filepath, outdir, verbose, extension):
    """Copy a file into ``outdir/<year>/<month>`` under a date based name.

    dt -- datetime giving the sub-directories (``YYYY/MM``) and the name of
          the copy (``YYYY_MM_DD``)

    The other arguments are forwarded to copy_file(), whose result is
    returned.
    """
    outdir = os.path.join(outdir, str(dt.year), f'{dt.month:02d}')
    filename = f'{dt.year:d}_{dt.month:02d}_{dt.day:02d}'
    return copy_file(orig_filepath, filename, outdir, verbose, extension)
def _try_open_office(orig_filepath, zipfile, zipname, filename, outdir, verbose):
    """Copy *orig_filepath* when the archive is a known OpenDocument file.

    zipfile  -- opened ZipFile of *orig_filepath*
    zipname  -- name of the archive member holding the MIME type
    filename -- original file name, used when no creation date is found

    The copy goes to ``outdir/office``: under a date based name when the
    creation date of ``meta.xml`` can be parsed, under the original name
    (with the detected extension) otherwise.

    Returns True when the file has been copied, False when the MIME type is
    unknown or the archive has no ``meta.xml``.
    """
    with zipfile.open(zipname) as mimetype:
        # Archive members are read as bytes while the table is keyed by str.
        mime = mimetype.read().decode('ascii', errors='replace').strip()
    ext = OPEN_OFFICE_MIME_TYPES.get(mime)
    if ext is None:
        return False
    if verbose:
        print(f'Found {ext} file')

    try:
        meta = zipfile.open('meta.xml')
    except KeyError:
        return False
    with meta:
        metadata = meta.read().decode('utf-8', errors='replace')

    outdir = os.path.join(outdir, 'office')
    try:
        start = metadata.index(OOO_CREATION_DATE_START) + len(OOO_CREATION_DATE_START)
        end = metadata.index(OOO_CREATION_DATE_END, start)
        # Keep the first 19 characters (YYYY-MM-DDTHH:MM:SS) so that a
        # trailing fraction or time zone does not make the parsing fail.
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker missing or date not understood: keep the original name.
        creation_date = None

    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, ext)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, ext)
    return True
def _try_ms_office(orig_filepath, zipfile, filename, outdir, verbose, extension):
    """Copy *orig_filepath* when the archive holds ``docProps/core.xml``.

    zipfile   -- opened ZipFile of *orig_filepath*
    filename  -- original file name, used when no creation date is found
    extension -- extension to give to the copy (chosen by the caller)

    The copy goes to ``outdir/office``: under a date based name when the
    creation date of ``docProps/core.xml`` can be parsed, under the original
    name (with *extension*) otherwise.

    Returns True when the file has been copied, False when the archive has
    no ``docProps/core.xml``.
    """
    try:
        meta = zipfile.open('docProps/core.xml')
    except KeyError:
        return False
    with meta:
        # Archive members are read as bytes while the markers are str.
        metadata = meta.read().decode('utf-8', errors='replace')

    outdir = os.path.join(outdir, 'office')
    try:
        start = metadata.index(MS_CREATION_DATE_START) + len(MS_CREATION_DATE_START)
        end = metadata.index(MS_CREATION_DATE_END, start)
        # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed,
        # whatever follows them is ignored.
        creation_date = datetime.strptime(metadata[start:end][:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Marker missing or date not understood: keep the original name.
        creation_date = None

    if creation_date is None:
        copy_file(orig_filepath, remove_extension(filename)[0], outdir, verbose, extension)
    else:
        copy_datetime_file(creation_date, orig_filepath, outdir, verbose, extension)
    return True
def manage_zip(orig_filepath, filename, extension, outdir, verbose=False):
    """Recognise an office document stored as a zip archive and copy it.

    The archive members are scanned: a ``mimetype`` member is tried as an
    OpenDocument file, a member below ``word/``, ``xl/`` or ``ppt/`` as an
    MS Office file (docx, xlsx, pptx). *extension* is not used.

    Returns True when the file has been copied, False otherwise (including
    when *orig_filepath* is not a readable zip archive), so that the caller
    can fall back to another treatment.
    """
    # First path component inside the archive -> extension of the document.
    ms_office_dirs = (('word/', 'docx'), ('xl/', 'xlsx'), ('ppt/', 'pptx'))
    try:
        with ZipFile(orig_filepath, 'r') as myzip:
            for name in myzip.namelist():
                # Maybe an Open Document file
                if name == 'mimetype':
                    if _try_open_office(orig_filepath, myzip, name, filename, outdir, verbose):
                        return True
                    continue
                for prefix, ms_extension in ms_office_dirs:
                    if not name.startswith(prefix):
                        continue
                    if _try_ms_office(orig_filepath, myzip, filename, outdir, verbose, ms_extension):
                        return True
                    break
    except BadZipFile:
        # Truncated or corrupted archive: not something we can identify.
        return False
    return False
def _escape_name(name):
    """Return *name* without ``/`` and ``\\`` characters.

    Used on tag values before they become part of a path, so that a value
    cannot introduce extra path components.
    """
    return name.replace('/', '').replace('\\', '')
def _append_tag(filename, tag):
    """Append *tag* to *filename*, separated by `` - `` when both are set.

    An empty or missing *tag* leaves *filename* untouched. Integer tags are
    written as numbers, any other tag goes through _escape_name().
    """
    if not tag:
        return filename
    prefix = f'{filename} - ' if filename else filename
    if isinstance(tag, int):
        return f'{prefix}{tag:d}'
    return f'{prefix}{_escape_name(tag)}'
def manage_audio(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy an audio file under a name built from its tags.

    The copy is named ``<track> - <title>.<extension>`` (missing parts are
    left out, the original name is used when both are missing) and goes to
    ``outdir/multimedia/audio/<artist>/<album>``, or to ``.../unknown`` when
    neither artist nor album is set.

    Returns True when the file has been copied, False when the tags cannot
    be read or when artist, album and title are all empty.
    """
    try:
        # getattr: whatever load() gives back, a result without `tag` is
        # handled like a file without tags.
        tags = getattr(eyed3.load(orig_filepath), 'tag', None)
    except Exception:
        return False
    if not tags or not (tags.artist or tags.album or tags.title):
        return False

    new_filename = _append_tag('', tags.track_num[0])
    new_filename = _append_tag(new_filename, tags.title)
    if not new_filename:
        # Neither track number nor title: do not create a file named
        # '.<extension>', keep the original name instead.
        new_filename = remove_extension(filename)[0]
    new_filename = '%s.%s' % (new_filename, extension)

    outdir = os.path.join(outdir, 'multimedia', 'audio')
    subdirs = [_escape_name(value) for value in (tags.artist, tags.album) if value]
    if subdirs:
        outdir = os.path.join(outdir, *subdirs)
    else:
        outdir = os.path.join(outdir, 'unknown')

    copy_file(orig_filepath, new_filename, outdir, verbose)
    return True
def manage_picture(orig_filepath, filename, extension, outdir, verbose=False):
    """Copy a picture under a name built from its EXIF shooting date.

    The date is read from the ``EXIF DateTimeOriginal`` tag and the copy
    goes to ``outdir/multimedia/pictures/<year>/<month>``. *filename* is not
    used.

    Returns True when the file has been copied, False when the tags cannot
    be read, when the date tag is missing or when it cannot be parsed.
    """
    with open(orig_filepath, 'rb') as f:
        try:
            tags = exifread.process_file(f)
        except Exception:
            return False

    try:
        date_tag = tags['EXIF DateTimeOriginal']
    except KeyError:  # No 'EXIF DateTimeOriginal'
        return False

    try:
        dt = datetime.strptime(date_tag.values, '%Y:%m:%d %H:%M:%S')
    except (TypeError, ValueError):
        # The tag is formatted outside of the f-string expression: quoting
        # the key inside it is a syntax error before Python 3.12.
        print(f"Invalid date format '{date_tag}'")
        return False

    outdir = os.path.join(outdir, 'multimedia', 'pictures')
    copy_datetime_file(dt, orig_filepath, outdir, verbose, extension)
    return True
# Multipliers indexed by unit: none, kilo, mega, giga.
_multiplier_bytes = (1, 1024, 1024 * 1024, 1024 * 1024 * 1024)
_multiplier_bits = (1, 1000, 1000 * 1000, 1000 * 1000 * 1000)
# Unit letter (lower case) -> index in the multiplier tables.
_multiplier_index = {'k': 1, 'm': 2, 'g': 3}


def parse_min_sizes(min_sizes_comma):
    """Parse the value of --min-size into a ``{extension: bytes}`` dict.

    min_sizes_comma -- comma separated ``extension:size`` pairs, for example
                       ``mp3:1M,txt:4k,doc:8``; an empty value gives ``{}``

    A size is an integer optionally followed by a unit (k, m or g, any
    case), an optional ``i`` and an optional ``b``/``B``. Units are powers
    of 1024, or powers of 1000 when the ``i`` is present.

    Raises ValueError when a pair is malformed or a size is not an integer.
    """
    if not min_sizes_comma:
        return {}

    min_sizes = {}
    for element in min_sizes_comma.split(','):
        sub_elements = element.split(':')
        if len(sub_elements) != 2:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element))
        extension, size = sub_elements

        if size[-1:] in ('b', 'B'):
            size = size[:-1]
        multiplier_tab = _multiplier_bytes
        # NOTE(review): an 'i' selects powers of 1000, which is the opposite
        # of the usual KiB/MiB convention. Kept as is - confirm the intent.
        if size[-1:] == 'i':
            size = size[:-1]
            multiplier_tab = _multiplier_bits
        multiplier_idx = _multiplier_index.get(size[-1:].lower(), 0)
        if multiplier_idx:
            size = size[:-1]

        try:
            size = int(size)
        except ValueError:
            raise ValueError('Invalid parameter --min-size \'%s\'' % (element)) from None
        min_sizes[extension] = size * multiplier_tab[multiplier_idx]
    return min_sizes
# Command line of the script; parsed at import time, the result is in `args`.
parser = argparse.ArgumentParser(
    description='Photorec post script analysis: try to recover filename with metadata '
                '(supported files : docx,xlsx,pptx,odt,ods,odp,mp3,jpg).')
parser.add_argument('--in', dest='in_dir', help='Directory in (with photorec results)', required=True)
parser.add_argument('--out', dest='out_dir', help='Directory out (script results)', required=True)
parser.add_argument('--max-files-per-temp', dest='max_files_per_temp',
                    help='Maximum unknown files in temporary directory, -1 for no temp dir (default)',
                    type=int, default=-1)
parser.add_argument('--skip-ext', dest='skip_ext',
                    help='Don\'t copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--only-ext', dest='only_ext',
                    help='Copy some extensions (comma separated eg : mp3,txt,doc)', default='')
parser.add_argument('--min-size', dest='min_size',
                    help='Minimum size for an extension (comma separated eg : mp3:1M,txt:4k,doc:8)', default='')
parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode')
parser.add_argument('--quiet', dest='quiet', action='store_true', default=False, help='Quiet mode')
args = parser.parse_args()
# Extension -> function trying to name the file from its metadata. Each
# function returns True when it has copied the file itself.
file_ops = {
    'zip': manage_zip,
    'odt': manage_zip,
    'ods': manage_zip,
    'odp': manage_zip,
    'docx': manage_zip,
    'xlsx': manage_zip,
    'pptx': manage_zip,
}

# Audio and picture support depend on optional packages: the matching
# extensions are only registered when the package can be imported.
try:
    import eyed3
    file_ops['mp3'] = manage_audio
except ImportError:
    print('Package eyed3 not installed, mp3 format not supported. Use pip install eyed3\n')

try:
    import exifread
    file_ops['jpg'] = manage_picture
    file_ops['jpeg'] = manage_picture
    file_ops['JPG'] = manage_picture
    file_ops['JPEG'] = manage_picture
except ImportError:
    print('Package exifread not installed, jpg format not supported. Use pip install exifread\n')
file_ops_keys = file_ops.keys()

# Numbered sub-directory currently receiving the plain copies and number of
# files already put in it (only used with --max-files-per-temp).
cur_out_dir = 0
cur_files_in_out_dir = 0

# Extension filters from the command line, None when the option is not set.
skip_exts = args.skip_ext.split(',') if args.skip_ext else None
only_exts = args.only_ext.split(',') if args.only_ext else None
min_sizes = parse_min_sizes(args.min_size)
min_sizes_keys = min_sizes.keys()

# Names made of 'f', digits, '_' and a remainder: the remainder is taken
# as the file name.
filename_re = re.compile(r'f\d+_(.+)')

# Disable (force) verbose on quiet
if args.quiet:
    args.verbose = False
# Directory receiving the plain copies. With --max-files-per-temp it is
# replaced below by numbered sub-directories of the output directory.
if args.max_files_per_temp == -1:
    outdir = args.out_dir

for root, dirs, files in os.walk(args.in_dir):
    for filename in files:
        full_path = os.path.join(root, filename)
        _, cur_extension = remove_extension(filename)

        # Only some extensions
        if only_exts and cur_extension not in only_exts:
            if args.verbose:
                print(f'Skipping {full_path} (only extension)')
            continue

        # Skipping some extensions
        if skip_exts and cur_extension in skip_exts:
            if args.verbose:
                print(f'Skipping {full_path} (skip extension)')
            continue

        # Min sizes
        if cur_extension in min_sizes:
            if os.stat(full_path).st_size < min_sizes[cur_extension]:
                if args.verbose:
                    print(f'Skipping {full_path} (min size)')
                continue

        # Filename already found
        m = filename_re.match(filename)
        if m:
            # Always the output directory itself: `outdir` is not bound yet
            # when --max-files-per-temp is set and no plain copy was done,
            # and the numbered directories are meant for unknown files.
            copy_file(full_path, m.group(1), args.out_dir, args.verbose)
            continue

        # Filtered files
        if cur_extension in file_ops:
            if args.verbose:
                print(f"Filter '{full_path}'")
            ret = file_ops[cur_extension](full_path, filename, cur_extension,
                                          args.out_dir, args.verbose)
            if ret:
                continue

        # Simple copy
        if args.max_files_per_temp != -1:
            if cur_files_in_out_dir == args.max_files_per_temp:
                cur_files_in_out_dir = 0
                cur_out_dir = cur_out_dir + 1
            outdir = os.path.join(args.out_dir, str(cur_out_dir))
            cur_files_in_out_dir = cur_files_in_out_dir + 1
        if args.verbose:
            print(f'Std copy {full_path} => {outdir}/{filename}')
        # The copy has just been announced: keep copy_file() silent.
        copy_file(full_path, filename, outdir, verbose=False)

if not args.quiet:
    print('Statistics :\n')
    for key in sorted(statistics):
        print(f'\t.{key}\t=> {statistics[key]}')