/* Copyright 2021 Grégory Soutadé This file is part of uPDFParser. uPDFParser is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. uPDFParser is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with uPDFParser. If not, see . */ #include #include #include #include #include #include "uPDFParser.h" #include "uPDFParser_common.h" namespace uPDFParser { std::string Object::str() { std::stringstream res; res << _objectId << " " << _generationNumber << " obj\n"; if (isIndirect()) res << " " << indirectOffset << "\n"; else { bool needLineReturn = false; if (!_dictionary.empty()) res << _dictionary.str(); else { if (!_data.size()) res << "<<>>\n"; else needLineReturn = true; } std::vector::iterator it; for(it=_data.begin(); it!=_data.end(); it++) { std::string tmp = (*it)->str(); res << tmp; if (tmp[tmp.size()-1] == '\n' || tmp[tmp.size()-1] == '\r') needLineReturn = false; } if (needLineReturn) res << "\n"; } res << "endobj\n"; return res.str(); } static DataType* tokenToNumber(std::string& token, char sign='\0') { int i; float fvalue; int ivalue; for(i=0; i<(int)token.size(); i++) { if (token[i] == '.') { if (i==0) token = std::string("0") + token; fvalue = std::stof(token); if (sign == '-') fvalue = -fvalue; return new Real(fvalue, (sign!='\0')); } } ivalue = std::stoi(token); if (sign == '-') ivalue = -ivalue; return new Integer(ivalue, (sign!='\0')); } /** * @brief Read data until '\n' or '\r' is found or buffer is full */ static inline int readline(int fd, char* buffer, int size, bool exceptionOnEOF=true) { int res = 0; char c; buffer[0] = 0; for (;size;size--,res++) { if (read(fd, &c, 1) != 1) { if (exceptionOnEOF) EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); return -1; } if (c == '\n' || c == '\r') { // Empty line if (!res) { size++ ; res--; continue; } else break; } buffer[res] = c; } if (size) buffer[res] = 0; return res; } /** * @brief Read data until EOF, '\n' or '\r' is found */ static inline void finishLine(int fd) { char c; while (1) { if (read(fd, &c, 1) != 1) break; if (c == '\n' || c == '\r') break; } // Support \r\n and \n\r if (read(fd, &c, 1) == 1) { if (c != '\n' && c != '\r') lseek(fd, -1, SEEK_CUR); } } char Parser::prevChar() { return c; } /** * @brief Find next token to analyze */ std::string Parser::nextToken(bool exceptionOnEOF, bool readComment) { char prev_c; std::string res(""); int i; static const char delims[] = " \t<>[]()/"; static const char whitespace_prev_delims[] = "+-"; // Need whitespace before static const char start_delims[] = "<>[]()"; bool found = false; c = 0; while (!found) { prev_c = c; if (read(fd, &c, 1) != 1) { if (exceptionOnEOF) EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); break; } // Comment, skip line if (c == '%') { if (readComment) { curOffset = lseek(fd, 0, SEEK_CUR)-1; res += c; while (true) { if (read(fd, &c, 1) != 1) { if (exceptionOnEOF) EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); break; } if (c == '\n' || c == '\r') break; res += c; } break; } finishLine(fd); if (res.size()) break; else continue; } // White character while empty result, continue if ((c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0') && !res.size()) continue; // Quit on line return without lseek(fd, -1, SEEK_CUR) if (c == '\n' || c == '\r') { if (res.size()) break; else continue; } if (res.size()) { // Push character until delimiter is found for (i=0; i<(int)sizeof(delims); i++) { if (c == delims[i]) { lseek(fd, -1, SEEK_CUR); found = true; break; } } // Push character until delimiter is found if (!found && prev_c == ' ') { for (i=0; i<(int)sizeof(whitespace_prev_delims); i++) { if (c == whitespace_prev_delims[i]) { lseek(fd, -1, SEEK_CUR); found = true; break; } } } if (!found) res += c; } else { curOffset = lseek(fd, 0, SEEK_CUR)-1; // First character, is it a delimiter ? for (i=0; i<(int)sizeof(start_delims); i++) { if (c == start_delims[i]) { found = true; break; } } res += c; } } // Double '>' and '<' to compute dictionary if (res == ">" || res == "<") { if (read(fd, &c, 1) == 1) { if (c == res[0]) res += c; else lseek(fd, -1, SEEK_CUR); } } return res; } void Parser::parseHeader() { char buf[5]; // Check %PDF at startup readline(fd, buf, 5, false); if (strncmp(buf, "%PDF-", 5)) EXCEPTION(INVALID_HEADER, "Invalid PDF header"); // Read major readline(fd, buf, 1, false); if (buf[0] < '0' || buf[0] > '9') EXCEPTION(INVALID_HEADER, "Invalid PDF major version " << buf[0]); version_major = buf[0] - '0'; readline(fd, buf, 1, false); if (buf[0] != '.') EXCEPTION(INVALID_HEADER, "Invalid PDF header"); // Read minor readline(fd, buf, 1, false); if (buf[0] < '0' || buf[0] > '9') EXCEPTION(INVALID_HEADER, "Invalid PDF minor version " << buf[0]); version_minor = buf[0] - '0'; finishLine(fd); curOffset = lseek(fd, 0, SEEK_CUR); } void Parser::parseStartXref() { std::string offset, token; // std::cout << "Parse startxref" << std::endl; offset = nextToken(); // XREF offset token = nextToken(false, true); // %%EOF if (strncmp(token.c_str(), "%%EOF", 5)) EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); /* Handle special case where we have : %%EOF1 0 obj\n */ if (token.size() > 5) lseek(fd, curOffset+5, SEEK_SET); /* Case where no xref table present */ if (xrefOffset == (off_t)-1) { DataType* integer = tokenToNumber(offset); if (integer->type() != DataType::TYPE::INTEGER) EXCEPTION(INVALID_TRAILER, "Invalid startxref offset"); xrefOffset = ((Integer*)integer)->value(); } } bool Parser::parseTrailer() { std::string token; // std::cout << "Parse trailer" << std::endl; token = nextToken(); if (token != "<<") EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); parseDictionary(&trailer, trailer.dictionary().value()); token = nextToken(); /* trailer without xref */ if (token != "startxref") { lseek(fd, curOffset, SEEK_SET); return false; } parseStartXref(); return true; } bool Parser::parseXref() { std::string tokens[3]; bool res = false; int curId = 0; // std::cout << "Parse xref" << std::endl; xrefOffset = curOffset; while (1) { tokens[0] = nextToken(); if (tokens[0] == "trailer") break; tokens[1] = nextToken(); // Reference ie: 0000000016 00000 n if (tokens[0].length() == 10) { tokens[2] = nextToken(); XRefValue xref(curId, std::stoi(tokens[0],0,10), std::stoi(tokens[1],0,10), (tokens[2] == "n") ? true : false); _xrefTable.push_back(xref); curId++; } // Object index ie: 0 6121 else { curId = std::stoi(tokens[0]); } } res = parseTrailer(); return res; } DataType* Parser::parseSignedNumber(std::string& token) { char sign = token[0]; token = std::string(&((token.c_str())[1])); return tokenToNumber(token, sign); } DataType* Parser::parseNumber(std::string& token) { return tokenToNumber(token); } DataType* Parser::parseNumberOrReference(std::string& token) { DataType* res = tokenToNumber(token); if (res->type() == DataType::TYPE::REAL) return res; off_t offset = lseek(fd, 0, SEEK_CUR); std::string token2 = nextToken(); std::string token3 = nextToken(); DataType* generationNumber = 0; try { generationNumber = tokenToNumber(token2); } catch (std::invalid_argument& e) { lseek(fd, offset, SEEK_SET); return res; } if ((generationNumber->type() != DataType::TYPE::INTEGER) || token3.size() != 1 || token3[0] != 'R') { delete generationNumber; lseek(fd, offset, SEEK_SET); return res; } DataType* res2 = new Reference(((Integer*)res)->value(), ((Integer*)generationNumber)->value()); delete res; return res2; } DataType* Parser::parseType(std::string& token, Object* object, std::map& dict) { DataType* value = 0; Dictionary* _value = 0; if (token == "<<") { _value = new Dictionary(); value = _value; parseDictionary(object, _value->value()); } else if (token == "[") value = parseArray(object); else if (token == "(") value = parseString(); else if (token == "<") value = parseHexaString(); else if (token == "stream") value = parseStream(object); else if (token[0] >= '1' && token[0] <= '9') value = parseNumberOrReference(token); else if (token[0] == '/') value = parseName(token); else if (token[0] == '+' || token[0] == '-') value = parseSignedNumber(token); else if (token[0] == '0' || token[0] == '.') value = parseNumber(token); else if (token == "true") return new Boolean(true); else if (token == "false") return new Boolean(false); else if (token == "null") return new Null(); else EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset); return value; } Array* Parser::parseArray(Object* object) { std::string token; DataType* value; Array* res = new Array(); while (1) { token = nextToken(); if (token == "]") break; value = parseType(token, object, object->dictionary().value()); //std::cout << "Add " << value->str() << std::endl; res->addData(value); } return res; } String* Parser::parseString() { std::string res(""); char c; bool escaped = false; int parenthesis_count = 1; /* Handle parenthesis in parenthesis */ while (1) { if (read(fd, &c, 1) != 1) break; if (c == '(' && !escaped) parenthesis_count++; else if (c == ')' && !escaped) parenthesis_count--; if (c == ')' && !escaped && parenthesis_count == 0) break; /* Handle \\ */ if (c == '\\' && escaped) escaped = false; else escaped = (c == '\\'); res += c; } return new String(res); } HexaString* Parser::parseHexaString() { std::string res(""); char c; while (1) { if (read(fd, &c, 1) != 1) break; if (c == '>') break; res += c; } if ((res.size() % 2)) EXCEPTION(INVALID_HEXASTRING, "Invalid hexa String at offset " << curOffset); return new HexaString(res); } Stream* Parser::parseStream(Object* object) { off_t startOffset, endOffset, endStream; std::string token; char c = 0; // std::cout << "parseStream" << std::endl; // Remove \n after \r if there is one if (prevChar() == '\r' && read(fd, &c, 1) == 1) { if (c != '\n') { lseek(fd, -1, SEEK_CUR); } } startOffset = lseek(fd, 0, SEEK_CUR); if (!object->hasKey("Length")) EXCEPTION(INVALID_STREAM, "No Length property at offset " << curOffset); DataType* Length = (*object)["Length"]; if (Length->type() == DataType::INTEGER) { Integer* length = (Integer*)Length; endOffset = startOffset + length->value(); lseek(fd, endOffset, SEEK_SET); token = nextToken(); if (token == "endstream") return new Stream(object->dictionary(), startOffset, endOffset, 0, 0, false, fd); // No endstream, come back at the begining lseek(fd, startOffset, SEEK_SET); } // Don't want to parse xref table... while (1) { char buffer[4*1024]; char* subs, c; int ret; ret = read(fd, buffer, sizeof(buffer)); if (ret <= 0) EXCEPTION(TRUNCATED_FILE, "Unexpected end of file"); subs = (char*)memmem((void*)buffer, ret, (void*)"endstream", 9); if (subs) { unsigned long pos = (unsigned long)subs - (unsigned long)buffer; // Here we're juste before "enstream" endOffset = lseek(fd, -(ret-pos), SEEK_CUR); // Final position must be after endstream\n endStream = endOffset + 10; // Remove trailing \r and \n before endstream for (;endOffset > startOffset; endOffset--) { lseek(fd, -1, SEEK_CUR); ret = read(fd, &c, 1); if (ret <= 0) break; if (c == '\r') { lseek(fd, -1, SEEK_CUR); continue; } else if (c == '\n') { lseek(fd, -1, SEEK_CUR); } break; } // Adjust final position lseek(fd, endStream, SEEK_SET); break; } } return new Stream(object->dictionary(), startOffset, endOffset, 0, 0, false, fd); } Name* Parser::parseName(std::string& name) { if (!name.size() || name[0] != '/') EXCEPTION(INVALID_NAME, "Invalid Name at offset " << curOffset); //std::cout << "Name " << name << std::endl; return new Name(name); } void Parser::parseDictionary(Object* object, std::map& dict) { std::string token; Name* key; DataType* value; while (1) { token = nextToken(); if (token == ">>") break; key = parseName(token); token = nextToken(); if (token == ">>") { dict[key->value()] = 0; break; } value = parseType(token, object, dict); dict[key->value()] = value; } } void Parser::parseObject(std::string& token) { off_t offset; int objectId, generationNumber; Object* object; offset = curOffset; try { objectId = std::stoi(token); token = nextToken(); generationNumber = std::stoi(token); } catch(std::invalid_argument& e) { EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); } token = nextToken(); if (token != "obj") EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); // std::cout << "New obj " << objectId << " " << generationNumber << std::endl; object = new Object(objectId, generationNumber, offset); _objects.push_back(object); std::vector& datas = object->data(); while (1) { token = nextToken(); if (token == "endobj") break; if (token == "<<") parseDictionary(object, object->dictionary().value()); else if (token[0] >= '1' && token[0] <= '9') { DataType* _offset = tokenToNumber(token); if (_offset->type() != DataType::TYPE::INTEGER) EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset); object->setIndirectOffset(((Integer*)_offset)->value()); } else { DataType* res = parseType(token, object, object->dictionary().value()); datas.push_back(res); } } // Keep a reference to last xrefObject if (object->hasKey("Type") && (*object)["Type"]->str() == "/XRef") xrefObject = object; } void Parser::parse(const std::string& filename) { std::string token; bool secondLine = true; if (fd) close(fd); fd = open(filename.c_str(), O_RDONLY); if (fd <= 0) EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); parseHeader(); // // Check %%EOF at then end // lseek(fd, -5, SEEK_END); // readline(fd, buf, 5); // if (strncmp(buf, "%%EOF", 5)) // EXCEPTION(INVALID_FOOTER, "Invalid PDF footer"); lseek(fd, curOffset, SEEK_SET); while (1) { token = nextToken(false); if (!token.size()) break; if (token == "xref") parseXref(); else if (token[0] >= '1' && token[0] <= '9') parseObject(token); // Can have startxref without trailer (not end of document) else if (token == "startxref") parseStartXref(); else { // The second line may be not commented and invalid (for UTF8 stuff) if (!secondLine) { EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset); } else finishLine(fd); } // If for optimization if (secondLine) secondLine = false; } // Synchronize xref table with parsed objects std::vector::iterator it; for (it=_xrefTable.begin(); it != _xrefTable.end(); it++) { Object* object = getObject((*it).objectId(), (*it).generationNumber()); if (object) { (*it).setObject(object); object->setUsed((*it).used()); } } repairTrailer(); // close(fd); } Object* Parser::getObject(int objectId, int generationNumber) { std::vector::iterator it; Object object(objectId, generationNumber, 0); for (it = _objects.begin(); it != _objects.end(); it++) { if (**it == object) return *it; } return 0; } void Parser::repairTrailer() { // Try to fill manadatory values not present in original trailer // with xrefObject if there is one if (!xrefObject) return; static const char* keys[] = {"Root", "Info", "Encrypt", "ID"}; for (int i=0; i<(int)(sizeof(keys)/sizeof(keys[0])); i++) { if (!trailer.hasKey(keys[i]) && xrefObject->hasKey(keys[i])) trailer.dictionary().addData(keys[i], (*xrefObject)[keys[i]]->clone()); } } void Parser::removeObject(Object* object) { std::vector::iterator it; for(it = _objects.begin(); it != _objects.end(); it++) { if (**it == *object) { delete *it; _objects.erase(it); break; } } } void Parser::writeBuffer(int fd, const char* buffer, int size) { int ret; do { ret = ::write(fd, buffer, size); if (ret >= 0) { size -= ret; buffer += ret; } else { EXCEPTION(IO_ERROR, "IO Error (write) %m"); } } while (size); } void Parser::writeUpdate(const std::string& filename) { struct stat _stat; int statRet = stat(filename.c_str(), &_stat); int newFd = open(filename.c_str(), O_WRONLY|O_APPEND|O_CREAT, S_IRUSR|S_IWUSR); if (newFd <= 0) EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); // Copy file if it doesn't exists if (statRet == -1 && errno == ENOENT) { char buffer[4096]; int ret; lseek(fd, 0, SEEK_SET); while (true) { ret = ::read(fd, buffer, sizeof(buffer)); if (ret <= 0) break; writeBuffer(newFd, buffer, ret); } } writeBuffer(newFd, "\r", 1); int maxId = 0; std::stringstream xref; int nbNewObjects = 0; xref << std::setfill('0'); xref << "xref\n"; std::vector::iterator it; for(it=_objects.begin(); it!=_objects.end(); it++) { Object* object = *it; if (object->objectId() > maxId) maxId = object->objectId(); if (!object->isNew()) continue; nbNewObjects ++; std::string objStr = object->str(); curOffset = lseek(newFd, 0, SEEK_CUR); writeBuffer(newFd, objStr.c_str(), objStr.size()); xref << std::setw(0) << object->objectId() << " 1\n"; xref << std::setw(10) << curOffset << " " << std::setw(5) << object->generationNumber() << " n\r\n"; // Here \r seems important } if (!nbNewObjects) { close(newFd); return; } off_t newXrefOffset = lseek(newFd, 0, SEEK_CUR); std::string xrefStr = xref.str(); writeBuffer(newFd, xrefStr.c_str(), xrefStr.size()); trailer.deleteKey("Prev"); if (xrefOffset != (off_t)-1) trailer.dictionary().addData("Prev", new Integer((int)xrefOffset)); trailer.deleteKey("Size"); trailer.dictionary().addData("Size", new Integer(maxId+1)); std::string trailerStr = trailer.dictionary().str(); writeBuffer(newFd, "trailer\n", 8); writeBuffer(newFd, trailerStr.c_str(), trailerStr.size()); std::stringstream startxref; startxref << "startxref\n" << newXrefOffset << "\n%%EOF"; std::string startxrefStr = startxref.str(); writeBuffer(newFd, startxrefStr.c_str(), startxrefStr.size()); close(newFd); } void Parser::write(const std::string& filename, bool update) { if (update) return writeUpdate(filename); int newFd = open(filename.c_str(), O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); if (newFd <= 0) EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); char header[18]; int ret = snprintf(header, sizeof(header), "%%PDF-%d.%d\r%%%c%c%c%c\r\n", version_major, version_minor, 0xe2, 0xe3, 0xcf, 0xd3); writeBuffer(newFd, header, ret); int maxId = 0; std::stringstream xref; off_t xrefStmOffset = 0; xref << std::setfill('0'); xref << "xref\n"; xref << "0 1\n"; xref << "0000000000 65535 f\r\n"; std::vector::iterator it; for(it=_objects.begin(); it!=_objects.end(); it++) { Object* object = *it; std::string objStr = object->str(); curOffset = lseek(newFd, 0, SEEK_CUR); writeBuffer(newFd, objStr.c_str(), objStr.size()); xref << std::setw(0) << object->objectId() << " 1\n"; xref << std::setw(10) << curOffset << " " << std::setw(5) << object->generationNumber(); if (object->used()) xref << " n"; else xref << " f" ; xref << "\r\n" ; // Here \r seems important if (object->objectId() > maxId) maxId = object->objectId(); if (object->hasKey("Type") && (*object)["Type"]->str() == "/XRef") { // Try to keep Prev link valid if (object->hasKey("Prev") && xrefStmOffset != 0) { object->deleteKey("Prev"); object->dictionary().addData("Prev", new Integer(xrefStmOffset)); } xrefStmOffset = curOffset; } } off_t newXrefOffset = lseek(newFd, 0, SEEK_CUR); std::string xrefStr = xref.str(); writeBuffer(newFd, xrefStr.c_str(), xrefStr.size()); trailer.deleteKey("Prev"); trailer.deleteKey("Size"); trailer.dictionary().addData("Size", new Integer(maxId+1)); trailer.deleteKey("XRefStm"); if (xrefStmOffset != 0) trailer.dictionary().addData("XRefStm", new Integer(xrefStmOffset)); std::string trailerStr = trailer.dictionary().str(); writeBuffer(newFd, "trailer\n", 8); writeBuffer(newFd, trailerStr.c_str(), trailerStr.size()); std::stringstream startxref; startxref << "startxref\n" << newXrefOffset << "\n%%EOF"; std::string startxrefStr = startxref.str(); writeBuffer(newFd, startxrefStr.c_str(), startxrefStr.size()); close(newFd); } }