From 39e2f6ecc9b8087ccfdcd6f6e0c1c089b1690b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Soutad=C3=A9?= Date: Sat, 21 Aug 2021 18:22:58 +0200 Subject: [PATCH] Initial commit --- .gitignore | 5 + LICENSE | 165 ++++++++++ Makefile | 53 ++++ README.md | 33 ++ include/uPDFObject.h | 150 +++++++++ include/uPDFParser.h | 109 +++++++ include/uPDFParser_common.h | 72 +++++ include/uPDFTypes.h | 253 +++++++++++++++ src/uPDFParser.cpp | 616 ++++++++++++++++++++++++++++++++++++ src/uPDFTypes.cpp | 105 ++++++ 10 files changed, 1561 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 include/uPDFObject.h create mode 100644 include/uPDFParser.h create mode 100644 include/uPDFParser_common.h create mode 100644 include/uPDFTypes.h create mode 100644 src/uPDFParser.cpp create mode 100644 src/uPDFTypes.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..744e4e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*~ +libupdfparser.a +libupdfparser.so + + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a04128 --- /dev/null +++ b/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. 
+ + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. 
+ + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. 
+ + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..acea3de --- /dev/null +++ b/Makefile @@ -0,0 +1,53 @@ + +AR ?= $(CROSS)ar +CXX ?= $(CROSS)g++ + +CXXFLAGS=-Wall -fPIC -I./include +LDFLAGS= + +BUILD_STATIC ?= 0 +BUILD_SHARED ?= 1 + +TARGETS = +ifneq ($(BUILD_STATIC), 0) # compare the variable's value: the bare word BUILD_STATIC never equals 0 + TARGETS += libupdfparser.a +endif +ifneq ($(BUILD_SHARED), 0) + TARGETS += libupdfparser.so +endif + +ifneq ($(DEBUG),) +CXXFLAGS += -ggdb -O0 +else +CXXFLAGS += -O2 +endif + +SRCDIR := src +INCDIR := inc +BUILDDIR := obj +TARGETDIR := bin +SRCEXT := cpp +OBJEXT := o + +SOURCES = src/uPDFParser.cpp src/uPDFTypes.cpp +OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.$(OBJEXT))) + +all: obj $(TARGETS) + +obj: + mkdir obj + +$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(SRCEXT) + $(CXX) $(CXXFLAGS) -c $^ -o $@ + +libupdfparser.a: $(OBJECTS) + $(AR) crs $@ obj/*.o + +libupdfparser.so: $(OBJECTS) + $(CXX) obj/*.o $(LDFLAGS) -o $@ -shared + +test: test.c libupdfparser.a + g++ -ggdb -O0 $^ -o $@ -Iinclude libupdfparser.a + +clean: + rm -rf libupdfparser.so libupdfparser.a obj diff --git a/README.md
b/README.md new file mode 100644 index 0000000..447ad84 --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +Introduction +------------ + +A very simple PDF parser that will load PDF objects without interpretation (zlib, streams, string encoding...). +It currently only allows updating a PDF file with new objects. + + +Compilation +----------- + +Use the _make_ command + + make [CROSS=XXX] [DEBUG=1] [BUILD_STATIC=(0|1)] [BUILD_SHARED=(0|1)] + +CROSS can define a cross compiler prefix (e.g. arm-linux-gnueabihf-) + +DEBUG can be set to compile in DEBUG mode + +BUILD_STATIC builds libupdfparser.a if 1, nothing if 0 (default value), can be combined with BUILD_SHARED + +BUILD_SHARED builds libupdfparser.so if 1 (default value), nothing if 0, can be combined with BUILD_STATIC + + +Copyright +--------- + +Grégory Soutadé + + +License +------- + +LGPL v3 or later diff --git a/include/uPDFObject.h b/include/uPDFObject.h new file mode 100644 index 0000000..b80c7f7 --- /dev/null +++ b/include/uPDFObject.h @@ -0,0 +1,150 @@ +/* + Copyright 2021 Grégory Soutadé + + This file is part of uPDFParser. + + uPDFParser is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + uPDFParser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with uPDFParser. If not, see .
+*/ + +#ifndef _UPDFOBJECT_HPP_ +#define _UPDFOBJECT_HPP_ + +#include "uPDFTypes.h" + +namespace uPDFParser +{ + /** + * @brief PDF Object + */ + class Object + { + public: + Object(): + _objectId(0), _generationNumber(0), + offset(0), _isNew(false), indirectOffset(0) + {} + + /** + * @brief Object constructor + * + * @param objectId Object ID + * @param generationNumber Object generation number + * @param offset Offset of object in current PDF file + * @param isNew false if object has been read from file, + * true if it has been created or updated + * @param indirectOffset Indirect offset, object is considered indirect if != 0 + */ + Object(int objectId, int generationNumber, uint64_t offset, bool isNew=false, + off_t indirectOffset=0): + _objectId(objectId), _generationNumber(generationNumber), + offset(offset), _isNew(isNew), indirectOffset(indirectOffset) + {} + + /** + * @brief Delete every element of the data vector + */ + ~Object() + { + std::vector::iterator it; + for(it=_data.begin(); it!=_data.end(); it++) + delete *it; + } + + /** + * @brief Copy constructor: data and dictionary values are cloned, + * the copy is always marked as new + */ + Object(const Object& other) + { + _objectId = other._objectId; + _generationNumber = other._generationNumber; + offset = other.offset; + indirectOffset = other.indirectOffset; + _isNew = true; + + std::vector::const_iterator it; + for(it=other._data.begin(); it!=other._data.end(); it++) + _data.push_back((*it)->clone()); + + const std::map _dict = ((Dictionary)other._dictionary).value(); + std::map& _myDict = _dictionary.value(); + std::map::const_iterator it2; + for(it2=_dict.begin(); it2!=_dict.end(); it2++) + _myDict[it2->first] = it2->second->clone(); + } + + /** + * @brief Clone current object (call copy constructor) + */ + Object* clone() { return new Object(*this); } + + /** + * @brief Return internal dictionary + */ + Dictionary& dictionary() {return _dictionary;} + + /** + * @brief Return vector of data contained into object + */ + std::vector& data() {return _data;} + + /** + * @brief Object string representation + */ + std::string str(); + + /** + * @brief Set object as indirect if offset != 0 or not indirect if
offset == 0 + */ + void setIndirectOffset(off_t offset) {indirectOffset = offset;} + + /** + * @brief is object indirect (indirectOffset != 0) + */ + bool isIndirect() {return indirectOffset != 0;} + + /** + * @brief Get dictionary value (the key is created with a null value if missing) + */ + DataType*& operator[](const std::string& key) { return _dictionary.value()[key]; } + + /** + * @brief Check for key in object's dictionary + */ + bool hasKey(const std::string& key) { return _dictionary.value().count(key)?true:false; } + + /** + * @brief Is object new or updated ? (false if it has only been read from file) + */ + bool isNew() { return _isNew; } + + /** + * @brief Mark object as updated + */ + void update(void) { _isNew = true; } + + /** + * @brief Return object's id + */ + int objectId() { return _objectId; } + + /** + * @brief Return object's generation number + */ + int generationNumber() { return _generationNumber; } + + private: + int _objectId; + int _generationNumber; + off_t offset; + bool _isNew; + off_t indirectOffset; + Dictionary _dictionary; + std::vector _data; + }; +} +#endif diff --git a/include/uPDFParser.h b/include/uPDFParser.h new file mode 100644 index 0000000..3573943 --- /dev/null +++ b/include/uPDFParser.h @@ -0,0 +1,109 @@ +/* + Copyright 2021 Grégory Soutadé + + This file is part of uPDFParser. + + uPDFParser is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + uPDFParser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with uPDFParser. If not, see .
+*/ + +#ifndef _UPDFPARSER_HPP_ +#define _UPDFPARSER_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "uPDFTypes.h" +#include "uPDFObject.h" + +namespace uPDFParser +{ + /** + * @brief PDF Parser + */ + class Parser + { + public: + /** + * @brief Create an empty parser: no file opened, offsets set to 0 + * (writeUpdate() reads xrefOffset, it must not be left uninitialized) + */ + Parser(): + xrefOffset(0), fd(0), curOffset(0) + {} + + /** + * @brief Close the file descriptor if one is set and delete all objects + */ + ~Parser() + { + if (fd) close(fd); + + std::vector::iterator it; + for(it=_objects.begin(); it!=_objects.end(); it++) + delete *it; + } + + /** + * @brief Parse a file + */ + void parse(const std::string& filename); + + /** + * @brief Write a PDF file with internal objects + * + * @param filename File path + * @param update Only append new objects if true + * Write a new PDF file if false (not supported for now) + */ + void write(const std::string& filename, bool update=false); + + /** + * @brief Get internals (or parsed) objects + */ + std::vector& objects() { return _objects; } + + /** + * @brief Add an object (it will be deleted by the parser destructor) + */ + void addObject(Object* object) { _objects.push_back(object); } + + private: + void parseObject(std::string& token); + void parseXref(); + void parseTrailer(); + + std::string nextToken(bool exceptionOnEOF=true); + + DataType* parseType(std::string& token, Object* object, std::map& dict); + void parseDictionary(Object* object, std::map& dict); + DataType* parseSignedNumber(std::string& token); + DataType* parseNumber(std::string& token); + DataType* parseNumberOrReference(std::string& token); + Array* parseArray(Object* object); + String* parseString(); + HexaString* parseHexaString(); + Stream* parseStream(); + Name* parseName(std::string& token); + + void writeUpdate(const std::string& filename); + + std::vector _objects; + Object trailer; + off_t xrefOffset; + int fd; + off_t curOffset; + }; +} + +#endif diff --git a/include/uPDFParser_common.h b/include/uPDFParser_common.h new file mode 100644 index 0000000..77cc6a0 --- /dev/null +++ b/include/uPDFParser_common.h @@ -0,0 +1,72 @@ +#ifndef _UPDFPARSER_COMMON_HPP_ +#define
_UPDFPARSER_COMMON_HPP_ + +#include +#include +#include + +namespace uPDFParser +{ + enum PARSING_ERROR { + UNABLE_TO_OPEN_FILE = 1, + TRUNCATED_FILE, + INVALID_HEADER, + INVALID_LINE, + INVALID_FOOTER, + INVALID_DICTIONARY, + INVALID_NAME, + INVALID_BOOLEAN, + INVALID_NUMBER, + INVALID_STREAM, + INVALID_TOKEN, + INVALID_OBJECT, + INVALID_TRAILER, + INVALID_HEXASTRING, + NOT_IMPLEMENTED + + }; + + /** + * @brief Exception class + */ + class Exception : public std::exception + { + public: + + Exception(int code, const char* message, const char* file, int line): + code(code), line(line), file(file) + { + std::stringstream msg; + msg << "Exception code : 0x" << std::setbase(16) << code << std::endl; + msg << "Message : " << message << std::endl; + msg << "File : " << file << ":" << std::setbase(10) << line << std::endl; + fullmessage = strdup(msg.str().c_str()); + } + + Exception(const Exception& other) + { + this->code = other.code; + this->line = line; + this->file = file; + this->fullmessage = strdup(other.fullmessage); + } + + ~Exception() + { + free(fullmessage); + } + + const char * what () const throw () { return fullmessage; } + + int getErrorCode() {return code;} + + private: + int code, line; + const char* message, *file; + char* fullmessage; + }; + +#define EXCEPTION(code, message) \ + {std::stringstream __msg;__msg << message; throw uPDFParser::Exception(code, __msg.str().c_str(), __FILE__, __LINE__);} +} +#endif diff --git a/include/uPDFTypes.h b/include/uPDFTypes.h new file mode 100644 index 0000000..94e7659 --- /dev/null +++ b/include/uPDFTypes.h @@ -0,0 +1,253 @@ +/* + Copyright 2021 Grégory Soutadé + + This file is part of uPDFParser. + + uPDFParser is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + uPDFParser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with uPDFParser. If not, see . +*/ + +#ifndef _UPDFTYPES_HPP_ +#define _UPDFTYPES_HPP_ + +#include +#include +#include +#include +#include + +namespace uPDFParser +{ + /** + * @brief Base class for PDF object type + * From https://resources.infosecinstitute.com/topic/pdf-file-format-basic-structure/ + */ + class DataType + { + public: + enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM}; + + DataType(TYPE _type): + _type(_type) + {} + + virtual ~DataType() {} + + /** + * @brief Get current data type + */ + TYPE type() { return _type; } + + /** + * @brief String representation for serialization + */ + virtual std::string str() = 0; + + /** + * @brief Clone current object + */ + virtual DataType* clone() = 0; + + protected: + TYPE _type; + }; + + /** + * @brief Boolean value, serialized as " true" or " false" + */ + class Boolean : public DataType + { + public: + Boolean(bool value): + DataType(DataType::TYPE::BOOLEAN), _value(value) + {} + + virtual DataType* clone() {return new Boolean(_value);} + bool value() {return _value;} + virtual std::string str() { return (_value)?"
true":" false";} + + private: + bool _value; + }; + + /** + * @brief Integer value (with a flag telling if it has to be considered as signed) + */ + class Integer : public DataType + { + public: + Integer(int value, bool _signed=false): + DataType(DataType::TYPE::INTEGER), _value(value), _signed(_signed) + {} + + virtual DataType* clone() {return new Integer(_value, _signed);} + int value() {return _value;} + virtual std::string str(); + + private: + int _value; + bool _signed; + }; + + /** + * @brief Real value (with a flag telling if it has to be considered as signed) + */ + class Real : public DataType + { + public: + Real(float value, bool _signed=false): + DataType(DataType::TYPE::REAL), _value(value), _signed(_signed) + {} + + virtual DataType* clone() {return new Real(_value, _signed);} + float value() {return _value;} + virtual std::string str(); + + private: + float _value; + bool _signed; + }; + + /** + * @brief Name value + */ + class Name : public DataType + { + public: + Name(const std::string&); + + virtual DataType* clone() {return new Name(_value);} + // Return the stored name without its first character, str() returns it unchanged + std::string value() { + const char* name = _value.c_str(); + return std::string(&name[1]); + } + virtual std::string str() { return _value;} + + private: + std::string _value; + }; + + /** + * @brief String value, serialized between '(' and ')' + */ + class String : public DataType + { + public: + String(const std::string&); + + virtual DataType* clone() {return new String(_value);} + std::string value() {return _value;} + + // Escape '(' and ')' characters that are not already preceded by a backslash + virtual std::string str() { + char prev = '\0'; + std::string res("("); + + for(unsigned int i=0; i<_value.size(); i++) + { + if ((_value[i] == '(' || _value[i] == ')') && + prev != '\\') + res += '\\'; + res += _value[i]; + prev = _value[i]; + } + + res += ")"; + return res; + } + + private: + std::string _value; + }; + + /** + * @brief Hexadecimal string value, serialized between '<' and '>' + */ + class HexaString : public DataType + { + public: + HexaString(const std::string&); + + virtual DataType* clone() {return new HexaString(_value);} + std::string value() {return _value;} + virtual std::string str() { return std::string("<") + _value + std::string(">");} + + private: + std::string _value; + }; + + /** + * @brief Reference to another object, serialized as " objectId generationNumber R" + */ + class Reference : public DataType + { + public: + Reference(int objectId, int generationNumber): +
DataType(DataType::TYPE::REFERENCE), objectId(objectId), generationNumber(generationNumber) + {} + + virtual DataType* clone() {return new Reference(objectId, generationNumber);} + // Only the object id is returned, not the generation number + int value() {return objectId;} + virtual std::string str() { + std::stringstream res; + res << " " << objectId << " " << generationNumber << " R"; + return res.str(); + } + + private: + int objectId, generationNumber; + }; + + /** + * @brief Array of values + */ + class Array : public DataType + { + public: + Array(): + DataType(DataType::TYPE::ARRAY) + {} + + void addData(DataType* data) {_value.push_back(data);} + + // Deep copy: every element is cloned + virtual DataType* clone() { + Array* res = new Array(); + std::vector::iterator it; + for(it=_value.begin(); it!=_value.end(); it++) + res->addData((*it)->clone()); + return res; + } + std::vector& value() {return _value;} + virtual std::string str(); + + private: + std::vector _value; + }; + + /** + * @brief Dictionary of values + */ + class Dictionary : public DataType + { + public: + Dictionary(): + DataType(DataType::TYPE::DICTIONARY) + {} + + void addData(const std::string&, DataType*); + + // Deep copy: every value is cloned + virtual DataType* clone() { + Dictionary* res = new Dictionary(); + std::map::iterator it; + for(it=_value.begin(); it!=_value.end(); it++) + { + res->addData(it->first, it->second->clone()); + } + return res; + } + std::map& value() {return _value;} + virtual std::string str(); + + private: + std::map _value; + }; + + /** + * @brief Stream described by its start and end offsets, + * str() only returns an empty "stream"/"endstream" pair + */ + class Stream : public DataType + { + public: + Stream(int startOffset, int endOffset): + DataType(DataType::TYPE::STREAM), startOffset(startOffset), + endOffset(endOffset) + {} + virtual DataType* clone() {return new Stream(startOffset, endOffset);} + virtual std::string str() { return "stream\nendstream\n";} + + private: + int startOffset, endOffset; + }; +} + +#endif diff --git a/src/uPDFParser.cpp b/src/uPDFParser.cpp new file mode 100644 index 0000000..1e3e732 --- /dev/null +++ b/src/uPDFParser.cpp @@ -0,0 +1,616 @@ +/* + Copyright 2021 Grégory Soutadé + + This file is part of uPDFParser.
+ + uPDFParser is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + uPDFParser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with uPDFParser. If not, see . +*/ + +#include +#include +#include +#include +#include + +#include "uPDFParser.h" +#include "uPDFParser_common.h" + +namespace uPDFParser +{ + std::string Object::str() + { + std::stringstream res; + + res << _objectId << " " << _generationNumber << " obj\n"; + res << _dictionary.str(); + + std::vector::iterator it; + for(it=_data.begin(); it!=_data.end(); it++) + res << (*it)->str(); + + res << "endobj\n"; + + return res.str(); + } + + /** + * @brief Read data until '\n' or '\r' is found or buffer is full + */ + static inline int readline(int fd, char* buffer, int size, bool exceptionOnEOF=true) + { + int res = 0; + char c; + + buffer[0] = 0; + + for (;size;size--,res++) + { + if (read(fd, &c, 1) != 1) + { + if (exceptionOnEOF) + EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); + return -1; + } + + if (c == '\n' || c == '\r') + break; + + buffer[res] = c; + } + + if (size) + buffer[res] = 0; + + return res; + } + + /** + * @brief Read data until EOF, '\n' or '\r' is found + */ + static inline void finishLine(int fd) + { + char c; + + while (1) + { + if (read(fd, &c, 1) != 1) + break; + + if (c == '\n') + break; + } + } + + /** + * @brief Find next token to analyze + */ + std::string Parser::nextToken(bool exceptionOnEOF) + { + char c; + std::string res(""); + int i; + static const char delims[] = " \t<>[]()+-/"; + static const char start_delims[] = 
"<>[]()"; + bool found = false; + + while (!found) + { + if (read(fd, &c, 1) != 1) + { + if (exceptionOnEOF) + EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); + break; + } + + // Comment, skip line + if (c == '%') + { + finishLine(fd); + break; + } + + // White character while empty result, continue + if ((c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0') && !res.size()) + continue; + + // Quit on line return without lseek(fd, -1, SEEK_CUR) + if (c == '\n' || c == '\r') + { + if (res.size()) + break; + else + continue; + } + + if (res.size()) + { + // Push character until delimiter is found + for (i=0; i<(int)sizeof(delims); i++) + { + if (c == delims[i]) + { + lseek(fd, -1, SEEK_CUR); + found = true; + break; + } + } + + if (!found) + res += c; + } + else + { + curOffset = lseek(fd, 0, SEEK_CUR)-1; + + // First character, is it a delimiter ? + for (i=0; i<(int)sizeof(start_delims); i++) + { + if (c == start_delims[i]) + { + found = true; + break; + } + } + + res += c; + } + } + + // Double '>' and '<' to compute dictionary + if (res == ">" || res == "<") + { + if (read(fd, &c, 1) == 1) + { + if (c == res[0]) + res += c; + else + lseek(fd, -1, SEEK_CUR); + } + } + + return res; + } + + void Parser::parseTrailer() + { + std::string token; + char buffer[10]; + + // std::cout << "Parse trailer" << std::endl; + + token = nextToken(); + + if (token != "<<") + EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); + + parseDictionary(&trailer, trailer.dictionary().value()); + + token = nextToken(); + if (token != "startxref") + EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); + + token = nextToken(); + readline(fd, buffer, sizeof(buffer), false); + if (strncmp(buffer, "%%EOF", 5)) + EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); + } + + void Parser::parseXref() + { + std::string token; + + // std::cout << "Parse xref" << std::endl; + xrefOffset = curOffset; + + while (1) + { + token = 
nextToken(); + + if (token == "trailer") + { + parseTrailer(); + break; + } + } + } + + static DataType* tokenToNumber(std::string& token, char sign='\0') + { + int i; + float fvalue; + int ivalue; + + for(i=0; i<(int)token.size(); i++) + { + if (token[i] == '.') + { + if (i==0) token = std::string("0") + token; + fvalue = std::stof(token); + if (sign == '-') + fvalue = -fvalue; + return new Real(fvalue, (sign!='\0')); + } + } + + ivalue = std::stoi(token); + if (sign == '-') + ivalue = -ivalue; + + return new Integer(ivalue, (sign!='\0')); + } + + DataType* Parser::parseSignedNumber(std::string& token) + { + char sign = token[0]; + token = std::string(&((token.c_str())[1])); + return tokenToNumber(token, sign); + } + + DataType* Parser::parseNumber(std::string& token) + { + return tokenToNumber(token); + } + + DataType* Parser::parseNumberOrReference(std::string& token) + { + DataType* res = tokenToNumber(token); + + if (res->type() == DataType::TYPE::REAL) + return res; + + off_t offset = lseek(fd, 0, SEEK_CUR); + std::string token2 = nextToken(); + std::string token3 = nextToken(); + + DataType* generationNumber = 0; + try + { + generationNumber = tokenToNumber(token2); + } + catch (std::invalid_argument& e) + { + lseek(fd, offset, SEEK_SET); + return res; + } + + if ((generationNumber->type() != DataType::TYPE::INTEGER) || + token3.size() != 1 || token3[0] != 'R') + { + delete generationNumber; + lseek(fd, offset, SEEK_SET); + return res; + } + + DataType* res2 = new Reference(((Integer*)res)->value(), + ((Integer*)generationNumber)->value()); + delete res; + return res2; + } + + DataType* Parser::parseType(std::string& token, Object* object, std::map& dict) + { + DataType* value = 0; + Dictionary* _value = 0; + + if (token == "<<") + { + _value = new Dictionary(); + value = _value; + parseDictionary(object, _value->value()); + } + else if (token == "[") + value = parseArray(object); + else if (token == "(") + value = parseString(); + else if (token == "<") +
value = parseHexaString(); + else if (token == "stream") + value = parseStream(); + else if (token[0] >= '1' && token[0] <= '9') + value = parseNumberOrReference(token); + else if (token[0] == '/') + value = parseName(token); + else if (token[0] == '+' || token[0] == '-') + value = parseSignedNumber(token); + else if (token[0] == '0' || token[0] == '.') + value = parseNumber(token); + else if (token == "true") + return new Boolean(true); + else if (token == "false") + return new Boolean(false); + else + EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset); + + return value; + } + + Array* Parser::parseArray(Object* object) + { + std::string token; + DataType* value; + + Array* res = new Array(); + + while (1) + { + token = nextToken(); + + if (token == "]") + break; + + value = parseType(token, object, object->dictionary().value()); + //std::cout << "Add " << value->str() << std::endl; + res->addData(value); + } + + return res; + } + + /** + * @brief Read a string until its closing ')' (or end of file), + * escape sequences are kept as is in the result + */ + String* Parser::parseString() + { + std::string res(""); + char c; + bool escaped = false; + + while (1) + { + if (read(fd, &c, 1) != 1) + break; + + if (c == ')' && !escaped) + break; + + // A backslash that is itself escaped must not escape the next character + escaped = (c == '\\' && !escaped); + + res += c; + } + + return new String(res); + } + + HexaString* Parser::parseHexaString() + { + std::string res(""); + char c; + + while (1) + { + if (read(fd, &c, 1) != 1) + break; + + if (c == '>') + break; + + res += c; + } + + if ((res.size() % 2)) + EXCEPTION(INVALID_HEXASTRING, "Invalid hexa String at offset " << curOffset); + + return new HexaString(res); + } + + Stream* Parser::parseStream() + { + char buffer[1024]; + off_t endOffset; + + while (1) + { + endOffset = lseek(fd, 0, SEEK_CUR); + readline(fd, buffer, sizeof(buffer)); + if (!strncmp(buffer, "endstream", 9)) + break; + } + + return new Stream(curOffset, endOffset); + } + + Name* Parser::parseName(std::string& name) + { + if (!name.size() || name[0] != '/') + EXCEPTION(INVALID_NAME, "Invalid Name at offset " << curOffset); + +
//std::cout << "Name " << name << std::endl; + return new Name(name); + } + + void Parser::parseDictionary(Object* object, std::map& dict) + { + std::string token; + Name* key; + DataType* value; + + while (1) + { + token = nextToken(); + if (token == ">>") + break; + + key = parseName(token); + + token = nextToken(); + if (token == ">>") + { + dict[key->value()] = 0; + break; + } + + value = parseType(token, object, dict); + dict[key->value()] = value; + } + } + + void Parser::parseObject(std::string& token) + { + off_t offset; + int objectId, generationNumber; + Object* object; + + offset = curOffset; + try + { + objectId = std::stoi(token); + token = nextToken(); + generationNumber = std::stoi(token); + } + catch(std::invalid_argument& e) + { + EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); + } + + token = nextToken(); + + if (token != "obj") + EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); + + std::cout << "New obj " << objectId << " " << generationNumber << std::endl; + + object = new Object(objectId, generationNumber, offset); + _objects.push_back(object); + + while (1) + { + token = nextToken(); + + if (token == "endobj") + break; + + if (token == "<<") + parseDictionary(object, object->dictionary().value()); + else if (token[0] >= '1' && token[0] <= '9') + { + DataType* _offset = tokenToNumber(token); + if (_offset->type() != DataType::TYPE::INTEGER) + EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); + object->setIndirectOffset(((Integer*)_offset)->value()); + } + else + parseType(token, object, object->dictionary().value()); + } + } + + void Parser::parse(const std::string& filename) + { + char buf[16]; + std::string token; + + if (fd) + close(fd); + + fd = open(filename.c_str(), O_RDONLY); + + if (fd <= 0) + EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); + + // Check %PDF at startup + readline(fd, buf, 4); + if (strncmp(buf, "%PDF", 4)) + EXCEPTION(INVALID_HEADER, 
"Invalid PDF header"); + finishLine(fd); + + curOffset = lseek(fd, 0, SEEK_CUR); + + // // Check %%EOF at then end + // lseek(fd, -5, SEEK_END); + // readline(fd, buf, 5); + // if (strncmp(buf, "%%EOF", 5)) + // EXCEPTION(INVALID_FOOTER, "Invalid PDF footer"); + + lseek(fd, curOffset, SEEK_SET); + + while (1) + { + token = nextToken(false); + + if (!token.size()) + break; + + if (token == "xref") + parseXref(); + else if (token[0] >= '1' && token[0] <= '9') + parseObject(token); + else + EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset); + } + + close(fd); + } + + void Parser::writeUpdate(const std::string& filename) + { + int newFd = open(filename.c_str(), O_WRONLY|O_APPEND|O_CREAT, S_IRUSR|S_IWUSR); + + if (newFd <= 0) + EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); + + ::write(newFd, "\r", 1); + + std::stringstream xref; + int nbNewObjects = 0; + + xref << std::setfill('0'); + xref << "xref\n"; + + std::vector::iterator it; + for(it=_objects.begin(); it!=_objects.end(); it++) + { + if (!(*it)->isNew()) + continue; + nbNewObjects ++; + std::string objStr = (*it)->str(); + curOffset = lseek(newFd, 0, SEEK_CUR); + ::write(newFd, objStr.c_str(), objStr.size()); + xref << std::setw(0) << (*it)->objectId() << " 1\n"; + xref << std::setw(10) << curOffset << " " << std::setw(5) << (*it)->generationNumber() << " n\r\n"; // Here \r seems important + } + + if (!nbNewObjects) + { + close(newFd); + return; + } + + off_t newXrefOffset = lseek(newFd, 0, SEEK_CUR); + + std::string xrefStr = xref.str(); + ::write(newFd, xrefStr.c_str(), xrefStr.size()); + + if (trailer.hasKey("Prev")) + delete trailer["Prev"]; + + trailer["Prev"] = new Integer((int)xrefOffset); + + std::string trailerStr = trailer.dictionary().str(); + ::write(newFd, "trailer\n", 8); + ::write(newFd, trailerStr.c_str(), trailerStr.size()); + + std::stringstream startxref; + startxref << "startxref\n" << newXrefOffset << "\n%%EOF"; + + std::string startxrefStr = 
startxref.str(); + ::write(newFd, startxrefStr.c_str(), startxrefStr.size()); + + close(newFd); + } + + void Parser::write(const std::string& filename, bool update) + { + if (update) + return writeUpdate(filename); + else + EXCEPTION(NOT_IMPLEMENTED, "Full write not implemented"); + } + +} diff --git a/src/uPDFTypes.cpp b/src/uPDFTypes.cpp new file mode 100644 index 0000000..9de275c --- /dev/null +++ b/src/uPDFTypes.cpp @@ -0,0 +1,105 @@ +/* + Copyright 2021 Grégory Soutadé + + This file is part of uPDFParser. + + uPDFParser is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + uPDFParser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with uPDFParser. If not, see . 
+*/
+
+#include "uPDFTypes.h"
+#include "uPDFParser_common.h"
+
+namespace uPDFParser
+{
+    /**
+     * Add an explicit '+' in front of a rendered number when it was parsed
+     * with a sign and its text does not already start with '-'.
+     *
+     * std::to_string() already writes the '-' of negative values (including
+     * negative zero for floats), so only the '+' ever has to be added.
+     */
+    static std::string withExplicitSign(const std::string& digits, bool isSigned)
+    {
+        if (isSigned && !digits.empty() && digits[0] != '-')
+            return "+" + digits;
+        return digits;
+    }
+
+    /** Build a Name holding @p name. */
+    Name::Name(const std::string& name):
+        DataType(DataType::TYPE::NAME)
+    {
+        _value = name;
+    }
+
+    /** Build a String holding @p value. */
+    String::String(const std::string& value):
+        DataType(DataType::TYPE::STRING)
+    {
+        _value = value;
+    }
+
+    /** Build a HexaString holding @p value. */
+    HexaString::HexaString(const std::string& value):
+        DataType(DataType::TYPE::HEXASTRING)
+    {
+        _value = value;
+    }
+
+    /**
+     * Render the integer preceded by one space, with an explicit '+' when it
+     * is flagged as signed and not negative.
+     */
+    std::string Integer::str()
+    {
+        return " " + withExplicitSign(std::to_string(_value), _signed);
+    }
+
+    /**
+     * Render the real number preceded by one space, with an explicit '+' when
+     * it is flagged as signed and not negative.
+     */
+    std::string Real::str()
+    {
+        return " " + withExplicitSign(std::to_string(_value), _signed);
+    }
+
+    /** Render the array as "[...]" with its elements separated by a space. */
+    std::string Array::str()
+    {
+        std::string res("[");
+
+        for (auto it = _value.begin(); it != _value.end(); ++it)
+        {
+            // No separator as long as nothing follows the opening bracket
+            if (res.size() > 1)
+                res += " ";
+            res += (*it)->str();
+        }
+
+        return res + std::string("]");
+    }
+
+    /** Store @p value under @p key, replacing any previous entry. */
+    void Dictionary::addData(const std::string& key, DataType* value)
+    {
+        _value[key] = value;
+    }
+
+    /**
+     * Render the dictionary as "<<...>>" followed by a new line. Each key is
+     * written as "/key"; entries with a null value are written as a key only.
+     */
+    std::string Dictionary::str()
+    {
+        std::string res("<<");
+
+        for (const auto& entry : _value)
+        {
+            res += std::string("/") + entry.first;
+            if (entry.second)
+                res += entry.second->str();
+        }
+
+        return res + std::string(">>\n");
+    }
+}