Multiple improvements:
* nextToken() can return comments if needed (used for %%EOF) * Handle an object placed right after %%EOF without a line break * Handle trailer without xref * Handle nested parentheses in string objects * Fix error on stream parsing (fall back to reading in big chunks)
This commit is contained in:
parent
7f2a84c10c
commit
ea551b6f52
|
@ -81,10 +81,10 @@ namespace uPDFParser
|
||||||
private:
|
private:
|
||||||
void parseObject(std::string& token);
|
void parseObject(std::string& token);
|
||||||
void parseStartXref();
|
void parseStartXref();
|
||||||
void parseXref();
|
bool parseXref();
|
||||||
void parseTrailer();
|
bool parseTrailer();
|
||||||
|
|
||||||
std::string nextToken(bool exceptionOnEOF=true);
|
std::string nextToken(bool exceptionOnEOF=true, bool readComment=false);
|
||||||
|
|
||||||
DataType* parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict);
|
DataType* parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict);
|
||||||
void parseDictionary(Object* object, std::map<std::string, DataType*>& dict);
|
void parseDictionary(Object* object, std::map<std::string, DataType*>& dict);
|
||||||
|
|
|
@ -111,7 +111,7 @@ namespace uPDFParser
|
||||||
/**
|
/**
|
||||||
* @brief Find next token to analyze
|
* @brief Find next token to analyze
|
||||||
*/
|
*/
|
||||||
std::string Parser::nextToken(bool exceptionOnEOF)
|
std::string Parser::nextToken(bool exceptionOnEOF, bool readComment)
|
||||||
{
|
{
|
||||||
char c = 0, prev_c;
|
char c = 0, prev_c;
|
||||||
std::string res("");
|
std::string res("");
|
||||||
|
@ -134,8 +134,30 @@ namespace uPDFParser
|
||||||
// Comment, skip line
|
// Comment, skip line
|
||||||
if (c == '%')
|
if (c == '%')
|
||||||
{
|
{
|
||||||
|
if (readComment)
|
||||||
|
{
|
||||||
|
curOffset = lseek(fd, 0, SEEK_CUR)-1;
|
||||||
|
res += c;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
if (read(fd, &c, 1) != 1)
|
||||||
|
{
|
||||||
|
if (exceptionOnEOF)
|
||||||
|
EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (c == '\n' || c == '\r')
|
||||||
|
break;
|
||||||
|
res += c;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
finishLine(fd);
|
finishLine(fd);
|
||||||
break;
|
if (res.size())
|
||||||
|
break;
|
||||||
|
else
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// White character while empty result, continue
|
// White character while empty result, continue
|
||||||
|
@ -217,17 +239,22 @@ namespace uPDFParser
|
||||||
void Parser::parseStartXref()
|
void Parser::parseStartXref()
|
||||||
{
|
{
|
||||||
std::string token;
|
std::string token;
|
||||||
char buffer[10];
|
|
||||||
|
|
||||||
// std::cout << "Parse startxref" << std::endl;
|
// std::cout << "Parse startxref" << std::endl;
|
||||||
|
|
||||||
token = nextToken();
|
token = nextToken(); // XREF offset
|
||||||
readline(fd, buffer, sizeof(buffer), false);
|
token = nextToken(false, true); // %%EOF
|
||||||
if (strncmp(buffer, "%%EOF", 5))
|
if (strncmp(token.c_str(), "%%EOF", 5))
|
||||||
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
|
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
|
||||||
|
/*
|
||||||
|
Handle special case where we have :
|
||||||
|
%%EOF1 0 obj\n
|
||||||
|
*/
|
||||||
|
if (token.size() > 5)
|
||||||
|
lseek(fd, curOffset+5, SEEK_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Parser::parseTrailer()
|
bool Parser::parseTrailer()
|
||||||
{
|
{
|
||||||
std::string token;
|
std::string token;
|
||||||
|
|
||||||
|
@ -241,16 +268,22 @@ namespace uPDFParser
|
||||||
parseDictionary(&trailer, trailer.dictionary().value());
|
parseDictionary(&trailer, trailer.dictionary().value());
|
||||||
|
|
||||||
token = nextToken();
|
token = nextToken();
|
||||||
|
/* trailer without xref */
|
||||||
if (token != "startxref")
|
if (token != "startxref")
|
||||||
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
|
{
|
||||||
|
lseek(fd, curOffset, SEEK_SET);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
parseStartXref();
|
parseStartXref();
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Parser::parseXref()
|
bool Parser::parseXref()
|
||||||
{
|
{
|
||||||
std::string token;
|
std::string token;
|
||||||
|
bool res = false;
|
||||||
|
|
||||||
// std::cout << "Parse xref" << std::endl;
|
// std::cout << "Parse xref" << std::endl;
|
||||||
xrefOffset = curOffset;
|
xrefOffset = curOffset;
|
||||||
|
|
||||||
|
@ -260,10 +293,12 @@ namespace uPDFParser
|
||||||
|
|
||||||
if (token == "trailer")
|
if (token == "trailer")
|
||||||
{
|
{
|
||||||
parseTrailer();
|
res = parseTrailer();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DataType* tokenToNumber(std::string& token, char sign='\0')
|
static DataType* tokenToNumber(std::string& token, char sign='\0')
|
||||||
|
@ -405,16 +440,26 @@ namespace uPDFParser
|
||||||
std::string res("");
|
std::string res("");
|
||||||
char c;
|
char c;
|
||||||
bool escaped = false;
|
bool escaped = false;
|
||||||
|
int parenthesis_count = 1; /* Handle parenthesis in parenthesis */
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
if (read(fd, &c, 1) != 1)
|
if (read(fd, &c, 1) != 1)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (c == ')' && !escaped)
|
if (c == '(' && !escaped)
|
||||||
|
parenthesis_count++;
|
||||||
|
else if (c == ')' && !escaped)
|
||||||
|
parenthesis_count--;
|
||||||
|
|
||||||
|
if (c == ')' && !escaped && parenthesis_count == 0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
escaped = (c == '\\');
|
/* Handle \\ */
|
||||||
|
if (c == '\\' && escaped)
|
||||||
|
escaped = false;
|
||||||
|
else
|
||||||
|
escaped = (c == '\\');
|
||||||
|
|
||||||
res += c;
|
res += c;
|
||||||
}
|
}
|
||||||
|
@ -457,37 +502,37 @@ namespace uPDFParser
|
||||||
EXCEPTION(INVALID_STREAM, "No Length property at offset " << curOffset);
|
EXCEPTION(INVALID_STREAM, "No Length property at offset " << curOffset);
|
||||||
|
|
||||||
DataType* Length = (*object)["Length"];
|
DataType* Length = (*object)["Length"];
|
||||||
if (Length->type() != DataType::INTEGER)
|
// Try with a direct jump if no filter applied (Flatedecode)
|
||||||
|
if (!object->hasKey("Filter") && Length->type() == DataType::INTEGER)
|
||||||
{
|
{
|
||||||
if (Length->type() != DataType::REFERENCE)
|
Integer* length = (Integer*)Length;
|
||||||
EXCEPTION(INVALID_STREAM, "Invalid Length property at offset " << curOffset);
|
endOffset = startOffset + length->value();
|
||||||
|
lseek(fd, endOffset, SEEK_SET);
|
||||||
|
token = nextToken();
|
||||||
|
|
||||||
// Don't want to parse xref table...
|
if (token == "endstream")
|
||||||
while (1)
|
return new Stream(startOffset, endOffset);
|
||||||
{
|
|
||||||
char buffer[4*1024];
|
// No endstream, come back to the beginning
|
||||||
int ret;
|
lseek(fd, startOffset, SEEK_SET);
|
||||||
endOffset = lseek(fd, 0, SEEK_CUR);
|
|
||||||
ret = readline(fd, buffer, sizeof(buffer));
|
|
||||||
if (!strncmp(buffer, "endstream", 9))
|
|
||||||
{
|
|
||||||
lseek(fd, -(ret-9), SEEK_CUR);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new Stream(startOffset, endOffset);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Integer* length = (Integer*)Length;
|
// Don't want to parse xref table...
|
||||||
endOffset = startOffset + length->value();
|
while (1)
|
||||||
lseek(fd, endOffset, SEEK_SET);
|
{
|
||||||
token = nextToken();
|
char buffer[4*1024];
|
||||||
|
char* subs;
|
||||||
if (token != "endstream")
|
int ret;
|
||||||
EXCEPTION(INVALID_STREAM, "endstream not found at offset " << endOffset);
|
ret = readline(fd, buffer, sizeof(buffer));
|
||||||
|
subs = (char*)memmem((void*)buffer, ret, (void*)"endstream", 9);
|
||||||
// std::cout << "end parseStream" << std::endl;
|
if (subs)
|
||||||
|
{
|
||||||
|
unsigned long pos = (unsigned long)subs - (unsigned long)buffer;
|
||||||
|
lseek(fd, -(ret-pos-9), SEEK_CUR);
|
||||||
|
endOffset = lseek(fd, 0, SEEK_CUR);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
return new Stream(startOffset, endOffset);
|
return new Stream(startOffset, endOffset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -553,7 +598,7 @@ namespace uPDFParser
|
||||||
|
|
||||||
object = new Object(objectId, generationNumber, offset);
|
object = new Object(objectId, generationNumber, offset);
|
||||||
_objects.push_back(object);
|
_objects.push_back(object);
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
token = nextToken();
|
token = nextToken();
|
||||||
|
|
Loading…
Reference in New Issue
Block a user