Initial commit

2021-08-21 18:22:58 +02:00
commit 39e2f6ecc9
10 changed files with 1561 additions and 0 deletions
--- a/src/uPDFParser.cpp
+++ b/src/uPDFParser.cpp
@@ -0,0 +1,616 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "uPDFParser.h"
+#include "uPDFParser_common.h"
+
+namespace uPDFParser
+{
+    std::string Object::str()
+    {
+	std::stringstream res;
+
+	res << _objectId << " " << _generationNumber << " obj\n";
+	res << _dictionary.str();
+
+	std::vector<DataType*>::iterator it;
+	for(it=_data.begin(); it!=_data.end(); it++)
+	    res << (*it)->str();
+
+	res << "endobj\n";
+
+	return res.str();
+    }
+
+    /**
+     * @brief Read data until '\n' or '\r' is found or buffer is full
+     */
+    static inline int readline(int fd, char* buffer, int size, bool exceptionOnEOF=true)
+    {
+	int res = 0;
+	char c;
+
+	buffer[0] = 0;
+	
+	for (;size;size--,res++)
+	{
+	    if (read(fd, &c, 1) != 1)
+	    {
+		if (exceptionOnEOF)
+		    EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
+		return -1;
+	    }
+
+	    if (c == '\n' || c == '\r')
+		break;
+
+	    buffer[res] = c;
+	}
+
+	if (size)
+	    buffer[res] = 0;
+	
+	return res;
+    }
+
+    /**
+     * @brief Read data until EOF, '\n' or '\r' is found
+     */
+    static inline void finishLine(int fd)
+    {
+	char c;
+	
+	while (1)
+	{
+	    if (read(fd, &c, 1) != 1)
+		break;
+
+	    if (c == '\n')
+		break;
+	}
+    }
+
+    /**
+     * @brief Find next token to analyze
+     */
+    std::string Parser::nextToken(bool exceptionOnEOF)
+    {
+	char c;
+	std::string res("");
+	int i;
+	static const char delims[] = " \t<>[]()+-/";
+	static const char start_delims[] = "<>[]()";
+	bool found = false;
+	
+	while (!found)
+	{
+	    if (read(fd, &c, 1) != 1)
+	    {
+		if (exceptionOnEOF)
+		    EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
+		break;
+	    }
+
+	    // Comment, skip line
+	    if (c == '%')
+	    {
+		finishLine(fd);
+		break;
+	    }
+
+	    // White character while empty result, continue
+	    if ((c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0') && !res.size())
+		continue;
+
+	    // Quit on line return without lseek(fd, -1, SEEK_CUR)
+	    if (c == '\n' || c == '\r')
+	    {
+		if (res.size())
+		    break;
+		else
+		    continue;
+	    }
+
+	    if (res.size())
+	    {
+		// Push character until delimiter is found
+		for (i=0; i<(int)sizeof(delims); i++)
+		{
+		    if (c == delims[i])
+		    {
+			lseek(fd, -1, SEEK_CUR);
+			found = true;
+			break;
+		    }
+		}
+		
+		if (!found)
+		    res += c;
+	    }
+	    else
+	    {
+		curOffset = lseek(fd, 0, SEEK_CUR)-1;
+
+		// First character, is it a delimiter ?
+		for (i=0; i<(int)sizeof(start_delims); i++)
+		{
+		    if (c == start_delims[i])
+		    {
+			found = true;
+			break;
+		    }
+		}
+
+		res += c;
+	    }
+	}
+
+	// Double '>' and '<' to compute dictionary
+	if (res == ">" || res == "<")
+	{
+	    if (read(fd, &c, 1) == 1)
+	    {
+		if (c == res[0])
+		    res += c;
+		else
+		    lseek(fd, -1, SEEK_CUR);
+	    }
+	}
+	
+	return res;
+    }
+
+    void Parser::parseTrailer()
+    {
+	std::string token;
+	char buffer[10];
+
+	// std::cout << "Parse trailer" << std::endl;
+
+	token = nextToken();
+
+	if (token != "<<")
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+
+	parseDictionary(&trailer, trailer.dictionary().value());
+
+	token = nextToken();
+	if (token != "startxref")
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+
+	token = nextToken();
+	readline(fd, buffer, sizeof(buffer), false);
+	if (strncmp(buffer, "%%EOF", 5))
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+    }
+    
+    void Parser::parseXref()
+    {
+	std::string token;
+
+	// std::cout << "Parse xref" << std::endl;
+	xrefOffset = curOffset;
+
+	while (1)
+	{
+	    token = nextToken();
+
+	    if (token == "trailer")
+	    {
+		parseTrailer();
+		break;
+	    }
+	}
+    }
+
+    static DataType* tokenToNumber(std::string& token, char sign='\0')
+    {
+	int i;
+	float fvalue;
+	int ivalue;
+	
+	for(i=0; i<(int)token.size(); i++)
+	{
+	    if (token[i] == '.')
+	    {
+		if (i==0) token = std::string("0") + token;
+		fvalue = std::stof(token);
+		if (sign == '-')
+		    fvalue = -fvalue;
+		return new Real(fvalue, (sign!='\0'));
+	    }
+	}
+
+	ivalue = std::stoi(token);
+	if (sign == '-')
+	    ivalue = -ivalue;
+	
+	return new Integer(ivalue, (sign!='\0'));
+    }
+    
+    DataType* Parser::parseSignedNumber(std::string& token)
+    {
+	char sign = token[0];
+	token = std::string(&((token.c_str())[1]));
+	return tokenToNumber(token, sign);
+    }
+    
+    DataType* Parser::parseNumber(std::string& token)
+    {
+	return tokenToNumber(token);
+    }
+
+    DataType* Parser::parseNumberOrReference(std::string& token)
+    {
+	DataType* res = tokenToNumber(token);
+
+	if (res->type() == DataType::TYPE::REAL)
+	    return res;
+	
+	off_t offset = lseek(fd, 0, SEEK_CUR);
+	std::string token2 = nextToken();
+	std::string token3 = nextToken();
+
+	DataType* generationNumber = 0;
+	try
+	{
+	    generationNumber = tokenToNumber(token2);
+	}
+	catch (std::invalid_argument& e)
+	{
+	    lseek(fd, offset, SEEK_SET);
+	    return res;
+	}
+	
+	if ((generationNumber->type() != DataType::TYPE::INTEGER) ||
+	    token3.size() != 1 || token3[0] != 'R')
+	{
+	    delete generationNumber;
+	    lseek(fd, offset, SEEK_SET);
+	    return res;
+	}
+
+	DataType* res2 = new Reference(((Integer*)res)->value(),
+				       ((Integer*)generationNumber)->value());
+	delete res;
+	return res2;
+    }
+    
+    DataType* Parser::parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict)
+    {
+	DataType* value = 0;
+	Dictionary* _value = 0;
+
+	if (token == "<<")
+	{
+	    _value = new Dictionary();
+	    value = _value;
+	    parseDictionary(object, _value->value());
+	}
+	else if (token == "[")
+	    value = parseArray(object);
+	else if (token == "(")
+	    value = parseString();
+	else if (token == "<")
+	    value = parseHexaString();
+	else if (token == "stream")
+	    value = parseStream();
+	else if (token[0] >= '1' && token[0] <= '9')
+	    value = parseNumberOrReference(token);
+	else if (token[0] == '/')
+	    value = parseName(token);
+	else if (token[0] == '+' || token[0] == '-')
+	    value = parseSignedNumber(token);
+	else if (token[0] == '0' || token[0] == '.')
+	    value = parseNumber(token);
+	else if (token == "true")
+	    return new Boolean(true);
+	else if (token == "false")
+	    return new Boolean(false);
+	else
+	    EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset);
+
+	return value;
+    }
+
+    Array* Parser::parseArray(Object* object)
+    {
+	std::string token;
+	DataType* value;
+
+	Array* res = new Array();
+	
+	while (1)
+	{
+	    token = nextToken();
+
+	    if (token == "]")
+		break;
+
+	    value = parseType(token, object, object->dictionary().value());
+	    //std::cout << "Add " << value->str() << std::endl;
+	    res->addData(value);
+	}
+
+	return res;
+    }
+    
+    String* Parser::parseString()
+    {
+	std::string res("");
+	char c;
+	bool escaped = false;
+	
+	while (1)
+	{
+	    if (read(fd, &c, 1) != 1)
+		break;
+
+	    if (c == ')' && !escaped)
+		break;
+
+	    escaped = (c == '\\');
+
+	    res += c;
+	}
+
+	return new String(res);
+    }
+    
+    HexaString* Parser::parseHexaString()
+    {
+	std::string res("");
+	char c;
+	
+	while (1)
+	{
+	    if (read(fd, &c, 1) != 1)
+		break;
+
+	    if (c == '>')
+		break;
+
+	    res += c;
+	}
+
+	if ((res.size() % 2))
+	    EXCEPTION(INVALID_HEXASTRING, "Invalid hexa String at offset " << curOffset);
+	    
+	return new HexaString(res);
+    }
+
+    Stream* Parser::parseStream()
+    {
+	char buffer[1024];
+	off_t endOffset;
+
+	while (1)
+	{
+	    endOffset = lseek(fd, 0, SEEK_CUR);
+	    readline(fd, buffer, sizeof(buffer));
+	    if (!strncmp(buffer, "endstream", 9))
+		break;
+	}
+
+	return new Stream(curOffset, endOffset);
+    }
+    
+    Name* Parser::parseName(std::string& name)
+    {
+	if (!name.size() || name[0] != '/')
+	    EXCEPTION(INVALID_NAME, "Invalid Name at offset " << curOffset);
+
+	//std::cout << "Name " << name << std::endl;
+	return new Name(name);
+    }
+   
+    void Parser::parseDictionary(Object* object, std::map<std::string, DataType*>& dict)
+    {
+	std::string token;
+	Name* key;
+	DataType* value;
+
+	while (1)
+	{
+	    token = nextToken();
+	    if (token == ">>")
+		break;
+
+	    key = parseName(token);
+
+	    token = nextToken();
+	    if (token == ">>")
+	    {
+		dict[key->value()] = 0;
+		break;
+	    }
+
+	    value = parseType(token, object, dict);
+	    dict[key->value()] = value;
+	}
+    }
+    
+    void Parser::parseObject(std::string& token)
+    {
+	off_t offset;
+	int objectId, generationNumber;
+	Object* object;
+
+	offset = curOffset;
+	try
+	{
+	    objectId = std::stoi(token);
+	    token = nextToken();
+	    generationNumber = std::stoi(token);
+	}
+	catch(std::invalid_argument& e)
+	{
+	    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+	}
+
+	token = nextToken();
+
+	if (token != "obj")
+	    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+
+	std::cout << "New obj " << objectId << " " << generationNumber << std::endl;
+	
+	object = new Object(objectId, generationNumber, offset);
+	_objects.push_back(object);
+
+	while (1)
+	{
+	    token = nextToken();
+
+	    if (token == "endobj")
+		break;
+
+	    if (token == "<<")
+		parseDictionary(object, object->dictionary().value());
+	    else if (token[0] >= '1' && token[0] <= '9')
+	    {
+		DataType* _offset = tokenToNumber(token);
+		if (_offset->type() != DataType::TYPE::INTEGER)
+		    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+		object->setIndirectOffset(((Integer*)_offset)->value());
+	    }
+	    else
+		parseType(token, object, object->dictionary().value());
+	}
+    }
+
+    void Parser::parse(const std::string& filename)
+    {
+	char buf[16];
+	std::string token;
+
+	if (fd)
+	    close(fd);
+
+	fd = open(filename.c_str(), O_RDONLY);
+	
+	if (fd <= 0)
+	    EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
+
+	// Check %PDF at startup
+	readline(fd, buf, 4);
+	if (strncmp(buf, "%PDF", 4))
+	    EXCEPTION(INVALID_HEADER, "Invalid PDF header");
+	finishLine(fd);
+
+	curOffset = lseek(fd, 0, SEEK_CUR);
+
+	// // Check %%EOF at then end
+	// lseek(fd, -5, SEEK_END);
+	// readline(fd, buf, 5);
+	// if (strncmp(buf, "%%EOF", 5))
+	//     EXCEPTION(INVALID_FOOTER, "Invalid PDF footer");
+
+	lseek(fd, curOffset, SEEK_SET);
+
+	while (1)
+	{
+	    token = nextToken(false);
+
+	    if (!token.size())
+		break;
+
+	    if (token == "xref")
+		parseXref();
+	    else if (token[0] >= '1' && token[0] <= '9')
+		parseObject(token);
+	    else
+		EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset);
+	}
+	
+	close(fd);
+    }
+
+    void Parser::writeUpdate(const std::string& filename)
+    {
+	int newFd = open(filename.c_str(), O_WRONLY|O_APPEND|O_CREAT, S_IRUSR|S_IWUSR);
+
+	if (newFd <= 0)
+	    EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
+
+	::write(newFd, "\r", 1);
+
+	std::stringstream xref;
+	int nbNewObjects = 0;
+
+	xref << std::setfill('0');
+	xref << "xref\n";
+	
+	std::vector<Object*>::iterator it;
+	for(it=_objects.begin(); it!=_objects.end(); it++)
+	{
+	    if (!(*it)->isNew())
+		continue;
+	    nbNewObjects ++;
+	    std::string objStr = (*it)->str();
+	    curOffset = lseek(newFd, 0, SEEK_CUR);
+	    ::write(newFd, objStr.c_str(), objStr.size());
+	    xref << std::setw(0) << (*it)->objectId() << " 1\n";
+	    xref << std::setw(10) << curOffset << " " << std::setw(5) << (*it)->generationNumber() << " n\r\n"; // Here \r seems important 
+	}
+
+	if (!nbNewObjects)
+	{
+	    close(newFd);
+	    return;
+	}
+
+	off_t newXrefOffset = lseek(newFd, 0, SEEK_CUR);
+
+	std::string xrefStr = xref.str();
+	::write(newFd, xrefStr.c_str(), xrefStr.size());
+
+	if (trailer.hasKey("Prev"))
+	    delete trailer["Prev"];
+	
+	trailer["Prev"] = new Integer((int)xrefOffset);
+
+	std::string trailerStr = trailer.dictionary().str();
+	::write(newFd, "trailer\n", 8);
+	::write(newFd, trailerStr.c_str(), trailerStr.size());
+
+	std::stringstream startxref;
+	startxref << "startxref\n" << newXrefOffset << "\n%%EOF";
+	
+	std::string startxrefStr = startxref.str();
+	::write(newFd, startxrefStr.c_str(), startxrefStr.size());
+	
+	close(newFd);
+    }
+    
+    void Parser::write(const std::string& filename, bool update)
+    {
+	if (update)
+	    return writeUpdate(filename);
+	else
+	    EXCEPTION(NOT_IMPLEMENTED, "Full write not implemented");
+    }
+
+}
--- a/src/uPDFTypes.cpp
+++ b/src/uPDFTypes.cpp
@@ -0,0 +1,105 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "uPDFTypes.h"
+#include "uPDFParser_common.h"
+
+namespace uPDFParser
+{
+    Name::Name(const std::string& name):
+	DataType(DataType::TYPE::NAME)
+    {
+	_value = name;
+    }
+
+    String::String(const std::string& value):
+	DataType(DataType::TYPE::STRING)
+    {
+	_value = value;
+    }
+
+    HexaString::HexaString(const std::string& value):
+	DataType(DataType::TYPE::HEXASTRING)
+    {
+	_value = value;
+    }
+
+    std::string Integer::str()
+    {
+	std::string sign("");
+	if (_signed)
+	{
+	    if (_value >= 0)
+		sign = "+";
+	    else
+		sign = "-";
+	}
+
+	return " " + sign + std::to_string(_value);
+    }
+    
+    std::string Real::str()
+    {
+	std::string sign("");
+	if (_signed)
+	{
+	    if (_value >= 0)
+		sign = "+";
+	    else
+		sign = "-";
+	}
+
+	return " " + sign + std::to_string(_value);
+    }
+
+    std::string Array::str()
+    {
+	std::string res("[");
+	std::vector<DataType*>::iterator it;
+
+	for(it = _value.begin(); it!=_value.end(); it++)
+	{
+	    if (res.size() > 1)
+		res += " ";
+	    res += (*it)->str();
+	}
+	    
+	return res + std::string("]");
+    }
+
+    void Dictionary::addData(const std::string& key, DataType* value)
+    {
+	_value[key] = value;
+    }
+    
+    std::string Dictionary::str()
+    {
+	std::string res("<<");
+	std::map<std::string, DataType*>::iterator it;
+
+	for(it = _value.begin(); it!=_value.end(); it++)
+	{
+	    res += std::string("/") + it->first;
+	    if (it->second)
+		res += it->second->str();
+	}
+	    
+	return res + std::string(">>\n"); 
+   }
+}