1#@!#!123s

: / proc / thread-self / root / proc / thread-self / root / usr / share / ghostscript / Resource / Init /
Filename : pdf_rbld.ps
back
% Copyright (C) 2001-2019 Artifex Software, Inc.
% All Rights Reserved.
%
% This software is provided AS-IS with no warranty, either express or
% implied.
%
% This software is distributed under license and may not be copied,
% modified or distributed except as expressly authorized under the terms
% of the license contained in the file LICENSE in this distribution.
%
% Refer to licensing information at http://www.artifex.com or contact
% Artifex Software, Inc.,  1305 Grant Avenue - Suite 200, Novato,
% CA 94945, U.S.A., +1(415)492-9861, for further information.
%

% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)

% This module contains routines that are used if we detect an error
% while reading the xref tables.  These routines will scan the file and
% build an xref table by finding the objects.  We also need to find the
% appropriate trailer dictionary.  Note:  One procedure is also used
% even if we do not need to rebuild a PDF file.
%
% This module cannot rebuild a PDF file in which errors have been introduced
% inside objects or binary data streams.  It often succeeds with files whose
% end-of-line sequences have been converted between the Unix and DOS forms.

% Error flag.  If true --> we have found an object with duplicate object and
% generation numbers.  It is set to true by setxrefentry (rebuild mode only),
% and reset and then tested by search_objects.
/dup_obj_gen_num //false def

% Note:  This procedure is also used by non-rebuild code.
% Store a line in the xref arrays (the ObjectStream, Objects and Generations
% arrays).
% <obj num> <strm num> <obj loc> <gen num> <rebuild>
%                         setxrefentry <obj num> <strm num> <obj loc> <gen num>
% <rebuild> is a boolean.  When it is false an entry is stored only if the
% Objects slot for <obj num> is still null.  When it is true an existing entry
% may also be replaced (see the comments in the 'else' branch below).
% The <gen num> left on the stack is the value that is stored, i.e. the
% (possibly corrected) generation number + 1.
/setxrefentry
{
  5 1 roll		% stack: <rebuild> <obj num> <strm num> <obj loc> <gen num>
  dup 65535 or 65535 ne {	% true if any bit outside the low 16 is set
    (   **** Error:  Generation number out of 0..65535 range, assuming 0.\n)
    pdfformaterror
    (                Output may be incorrect.\n) pdfformaterror
    pop 0
  } if
        % We store generation numbers as value + 1
        % We reserve 0 to indicate a free xref entry
  1 add			% increment generation number
        % To save space, generation numbers are stored in a string unless we
        % find a generation number greater than 255.  If so then transfer to
        % an array.
  dup 255 gt {
    Generations type /stringtype eq {	% Convert Generations to an array.
      Generations length array dup	% Create new array
      0 1 2 index length 1 sub {	% Copy from old string to new array
        Generations 1 index get put dup
      } for
      pop
      /Generations exch store		% Save new Generations array
    } if
  } if
        % Verify that the new values are for a new object.  If the current
        % entry is null then we have a new entry.
  Objects 4 index get //null eq {
    ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
    Objects 4 index 3 index cvx put	% Save object location
    Generations 4 index 2 index put	% Save generation number
  } {
        % Verify that the new entry has at least as high a generation number
        % We accept equal entry number because we have found PDF files in
        % which there are multiple objects with the same object and entry
        % numbers.  The normal xref logic only accepts the first such
        % entry that it finds.  However the 'rebuild PDF' logic can find
        % both such entries.  The correct one is usually the last one.
    Generations 4 index get 1 index le {
      %% Check if the object we already found was at the location specified
      %% in the original xref (now stored in Orig_Objects). If so, prefer
      %% that offset, otherwise prefer the later object.
      %% NB check first to see that the object number is in the range of the original
      %% xref. If it isn't, set the 'original' Offset to 0.
      %% The test must be 'lt': valid indices are 0 .. length-1, so an object
      %% number equal to the length would make 'get' fail with a rangecheck.
      3 index Orig_Objects length lt {Orig_Objects 4 index get}{0}ifelse
      Objects 5 index get eq not
      {
        ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
        Objects 4 index 3 index cvx put	% Save object location
        Generations 4 index 2 index put	% Save generation number
      } if
    } if
        % Set error flag if we have equal object and generation numbers
    Generations 4 index get 1 index eq { /dup_obj_gen_num //true def } if
  } 8 -1 roll { ifelse } { pop if } ifelse  % Run 'else' only when rebuilding.
} bind executeonly def

% Debugging aid: dump the xref information, one line per object number.
% The information is held in three parallel arrays (Objects, Generations
% and ObjectStream), all indexed by object number.
/print_xref				% - print_xref -
{
  0 1 Objects length 1 sub {		% stack: <obj num>
    dup =only (  ) print			% object number
    Generations 1 index get 1 sub =only (  ) print	% generation (stored as value + 1)
    ObjectStream 1 index get ==only (  ) print	% ObjectStream object number
    Objects exch get ===			% object location, ends the line
  } for
  flush
} bind executeonly def

% Read one token from a string and accept it only if it has the wanted type.
%   <string> <type> typed_token <false>		% no token, or wrong type
%   <string> <type> typed_token <obj> <last> <true>	% token of the wanted type
% <last> is the part of the string that follows the token.
/typed_token
{
  exch token_nofail {			% stack: <type> <last> <token>
    3 -1 roll 1 index type eq {		% stack: <last> <token> <bool>
      exch //true			% match: leave <token> <last> true
    } {
      pop pop //false			% wrong type: drop token and remainder
    } ifelse
  } {
    pop //false				% no token: drop the wanted type
  } ifelse
} bind executeonly def

% Allocate space for post_eof_count to be bound into procedures below.
% It is the number of bytes at the end of the file that the trailer searches
% and the object scan ignore.  search_objects sets it from
% determine_post_eof_count; until then it is 0.
/post_eof_count 0 def

% We want the location of the trailer dictionary at the start of file.
% First we will find the xref.  Then we will skip over the xref entries
% to the trailer.
% Returns the file position just after the 'trailer' keyword, or 0 if no
% 'xref' keyword is found in the first 300 bytes of the file.
/search_start_trailer		% - search_start_trailer <trailer loc>
{ % Read the first 300 bytes and check for xref
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% location of end of data
  300 .min			% block size to read
  dup string 0 1 4 -1 roll 1 sub	% stack: <string> 0 1 <block size - 1>
  { 2 copy PDFfile read pop put pop } for	% fill the string byte by byte
  (xref) search {
    % found 'xref'.  Position the file just after it:
    % offset = length of the text preceding 'xref' + 4
    exch pop exch pop length 4 add PDFfile exch setfileposition
    PDFfile token pop		% get starting entry - or 'trailer'
    (trailer) ne {		% if we do not already have 'trailer'
      PDFfile token pop		% get number of entries
      PDFfile token pop pop	% this moves us into the middle of the first entry
      25 string exch		% define working string for readline
      { PDFfile 1 index readline pop pop
      } repeat			% skip entries
      pop			% pop working string
      PDFfile token pop pop	% get 'trailer'
    } if
    % In both cases the file is now positioned just after 'trailer'.  The
    % position must be returned even when 'trailer' directly follows 'xref',
    % otherwise the caller finds nothing on the stack.
    PDFfile fileposition	% get file position
  } {
    pop 0                 	% no xref, should not happen, report it upstream
  } ifelse
} bind executeonly def


%% Searches backwards from a specified point looking for a 'trailer' keyword.
%%      position search_earlier_trailer position or 0
%%
%% The file is examined in blocks of up to 65535 bytes, each ending where the
%% previous one started.  The result is the file position just after the last
%% 'trailer' keyword in the first block that contains one, or 0 if the start
%% of the file is reached without finding one.
%%
%% It is just possible that a 'trailer' keyword could straddle a buffer, in which case
%% we wouldn't find it. Given that this only executes for broken files anyway I don't
%% propose to worry about it at the moment, if anyone ever turns up an example we may
%% choose to enhance this routine further.
%%
/search_earlier_trailer {
  {                                             % position
    dup 0 gt {                                  % position bool
      dup 65535 .min exch                       % block_size position
      1 index sub                               % block_size position-block_size
      dup                                       % block_size new_position new_position
      PDFfile exch setfileposition              % block_size new_position
      exch dup                                  % new_position block_size block_size
      dup string 0 1 4 -1 roll 1 sub            % new_position block_size string 0 1 block_size-1
      {2 copy PDFfile read pop put pop } for    % fill the string byte by byte
                                                % new_position block_size (...string from file....)
      (trailer) search {
        pop                                     % discard the text before the first match
        { search not { exit } if pop } loop     % keep searching the remainder for later matches
                                                % determine where the trailer is in the file
                                                %   trailer loc = end loc - remaining string length
        length                                  % new_position block_size string_length
        3 1 roll                                % string_length new_position block_size
        add exch sub                            % (new_position + block_size) - string_length
      } {
        pop                                     % discard the string that was searched
        pop 0                                   % discard block_size: new_position 0
      } ifelse
    } {
      pop 0 exit                                % reached the start of the file, give up
    }ifelse

    %% We either have a position for a trailer, or 0 if we failed to find one
    dup 0 eq not {
      exit
    }if
    pop                                         % remove the zero leaving the new start position
  } loop
} bind executeonly def

% We want the location of the trailer dictionary at the end of file.
% We will read the last block of data (at most 65535 bytes) and search for the
% final occurrence of the word 'trailer'.
% Returns the file position just after that 'trailer' keyword, or 0 if the
% block does not contain one.
/search_end_trailer		% - search_end_trailer <trailer loc>
{ % Position to read block of data from the end of the file.  Note:  We ignore
  % anything past the last %%EOF since this is not PDF data.
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% location of end of data
  dup 65535 .min		% block size to read
                                % stack: <file end pos> <block size>
  % move file position to the start of the block
  2 copy sub PDFfile exch setfileposition
  % read block of data, one byte at a time; leaves <file end pos> <string>
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
  % search for last occurrence of 'trailer'
  (trailer) search {
    pop				% discard the text before the first match
    % keep searching the remainder until there are no more matches
    { search not { exit } if pop } loop
    % determine where the trailer is in the file
    %   trailer loc = end loc - remaining string length
    length sub
  } {
    pop pop 0			% not found: discard string and end pos
  } ifelse
} bind executeonly def

% We want to find the trailer dictionary.  There is a trailer dictionary
% for each xref object list.  We only want the trailer dictionary associated
% with the first xref object list.  In theory this can be anywhere in the
% file.  However since we are trying to repair a broken file, we cannot simply
% follow the xref links.  So we are falling back to a simple strategy.  We
% find the specified location of the first xref list.  If its location is in
% the first half of the file then we search for the first trailer dictionary
% at the start of the file.  Otherwise we search for the last trailer at the
% end of the file.
%
% On success the dictionary is stored as /Trailer.  If the trailer that was
% found has no /Root entry, earlier trailers are tried in turn.  If no usable
% trailer is found an error is reported and /Trailer is not defined here.
/search_trailer			% - search_trailer -
{ % Find the 'startxref' and associated position at the end of the file.
  % Position to read block of data from the end of the file.  Note:  We
  % actually end at the end of the last %%EOF since this is the end of the
  % useful PDF data.  (Some files contain trailing garbage.)
  PDFfile 0 setfileposition
  PDFfile bytesavailable	% size of file
  post_eof_count sub dup	% location of end of last %%EOF
  dup 4096 .min			% block size to read
  % stack: <useful file size> <useful file size> <block size>
  % move file position to the start of the block
  2 copy sub PDFfile exch setfileposition
  % read block of data
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
  % search for last occurrence of 'startxref'
  //false				% Assume that startxref not present
  exch (startxref) {
     search not { exit } if	% Exit loop when no more startxref's
     pop 3 -1 roll pop //true 3 1 roll	% Indicate that we have found startxref
  } loop
  exch				% Exch last string and 'found' flag
  {
     % determine where the startxref is in the file
     %   'startxref' loc = end loc - remaining string length - 9 bytes
     length sub 9 sub
     % move the file to this position and read startxref and position
     PDFfile exch setfileposition PDFfile token
     pop pop PDFfile token_no_close pop
     dup type /integertype eq not {
       pop
       % startxref not followed by integer.  We will search the end of the file for trailer.
       PDFfilelen
     } if
  } {
     % startxref not found.  We will search the end of the file for trailer.
     pop pop PDFfilelen
  } ifelse
  % compare xref position to 1/2 the length of the file and search for trailer
  exch 2 div lt {
    search_start_trailer dup 0 eq { pop search_end_trailer } if
  } {
    search_end_trailer   dup 0 eq { pop search_start_trailer } if
  } ifelse
  dup 0 eq {
    pop
    (   **** Error:  Trailer dictionary not found.\n) pdfformaterror
    (                Output may be incorrect.\n) pdfformaterror
  }{
  % get the trailer
  dup
  PDFfile exch setfileposition		% set to the specified trailer location
  /dictlevelcount 0 def
  PDFfile traileropdict .pdfrun		% read trailer info

  {
    % stack: <trailer loc> <trailer dict>
    dup /Root known {
      /Trailer exch def			% this trailer is usable
      pop				% discard the trailer location
      exit
    } if
    (   **** Warning: This trailer dictionary does not contain a /Root entry\n        searching for a prior trailer.\n) pdfformatwarning
    %% Remove the trailer dict, then step back over the length of 'trailer' so
    %% we don't find the same one again.  The location of the trailer just
    %% rejected is replaced by the result of the search, so that each retry
    %% starts from the most recently examined trailer and the search always
    %% moves towards the start of the file.
    pop 7 sub
    search_earlier_trailer		% stack: <earlier trailer loc> or 0
    dup 0 eq {
      % Nothing left to try.  Leave the stack clean and stop; continuing
      % would apply 'known' to an integer.
      pop
      (   **** Error:  Valid trailer not found.\n) pdfformaterror
      (                Output may be incorrect.\n) pdfformaterror
      exit
    } if
    dup
    PDFfile exch setfileposition	% set to the specified trailer location
    /dictlevelcount 0 def
    PDFfile traileropdict .pdfrun	% read trailer info
  } loop
  } ifelse
} bind executeonly def

% This routine will determine if there is stuff after the %%EOF.  There is
% supposed to be only a line termination.  However many real life files
% contain some garbage.  This routine checks how much.  We then ignore this
% stuff when we are scanning for objects.
%
% Only the last 4096 bytes (or the whole file if it is shorter) are examined.
% The count returned is the length of what remains of that block after the
% last 'startxref' and then after either '%%EO' or, when '%%EO' is absent,
% the first token following 'startxref'.  It is 0 if 'startxref' is not found.
/determine_post_eof_count		% - determine_post_eof_count <count>
{ % Position to read block of data from the end of the file.
  PDFfilelen			% size of file
  dup 4096 .min	                % file_size block_size
  dup 3 1 roll sub              % block_size file_size-block_size
  PDFfile exch setfileposition  % block_size
  string PDFfile exch readstring pop % <block of data as a string>

  % search for last occurrence of 'startxref', '%%EOF' is often damaged
  (startxref) search {
    pop				% discard the text before the first match
    { search not { exit } if pop
    } loop
    % how much is left = remaining string length

    % Now search for %%EO or try to read a number after 'startxref'.
    (%%EO) search {
      pop pop			% keep only the text following '%%EO'
    } {
      % Look for a number after startxref
      % If a token can be read, keep only the text that follows it.
      % Errors raised while scanning the token are caught by 'stopped'.
      { dup token { pop exch pop } if
      } stopped pop
    } ifelse
    length
  } {
    % Can't even find startxref, assume it's all objects
    pop 0
  } ifelse
} bind executeonly def

% This routine will scan a file searching for object locations to build
% an alternate version of the data in the xref tables.
% Its purpose is to provide a basis for an xref fixing facility.
%
% The file is read line by line.  Every line that starts with the tokens
% <integer> <integer> obj is recorded through setxrefentry (rebuild mode),
% using the file position of the start of the line less PDFoffset as the
% object location.
/search_objects				% - search_objects -
{ % Initialize the Objects, Generations, etc. arrays
  % Keep a copy of the offsets read from the original xref; setxrefentry
  % consults it when it has to choose between duplicate objects.
  Objects dup length array copy /Orig_Objects exch def
  initPDFobjects
  % reset duplicate object and generation numbers error flag
  /dup_obj_gen_num //false def
  % Determine how many bytes are in the file after the final %%EOF
  /post_eof_count determine_post_eof_count def
  % Start at the beginning of the file
  PDFfile 0 setfileposition
  % Create a working string (and also store its length on stack).  We are
  % using a maximum size string since the logic below wants a recovered object
  % to fit into our working string.
  65535 dup string			% stack: <length> <working_str>
  { % Now loop through the entire file looking for objects
    PDFfile fileposition		% save current file position
    % When we get near the end of the file, we use a smaller interval of
    % our working string to prevent reading past the end.  (See comments on
    % EOF testing below.)
    PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
      2 index 0 3 -1 roll getinterval	% near EOF, use interval of string
    } {
      pop 1 index			% not near end, use full working string
    } ifelse
    % Read a line from file.  If the line does not fit into our working string,
    % or any other error, then we will discard it.
    PDFfile exch { readline } //.internalstopped exec
    { pop pop //false } if		% indicate no string if we stopped
    { % stack: <length> <working_str> <loc> <string>
      % Now that we have line, get obj num, ref num, and 'obj'.  Verify that each
      % of these is correct type.
      dup (obj) search {                % preliminary check for obj
        pop pop pop			% discard the search results, keep the line
        /integertype typed_token {	% get obj number
          /integertype typed_token {	% get ref number
            /nametype typed_token {	% get 'obj' text
              pop			% pop remaining string
              /obj eq {			% verify name is 'obj'
                % make sure we have room in the arrays.  We work in increments
                % of 20 each time we increase the size.
                1 index 20 add 20 idiv 20 mul
                growPDFobjects
                % save xref parameters into ObjectStream, Objects and Generations
                % stack here: ... <loc> <obj num> <ref num>
                1 index 0		% rearrange parms for setxrefentry
                4 index PDFoffset sub 3 index
                //true setxrefentry	% save parameters
                pop pop pop pop		% clear parameters
              } if			% check if name is 'obj'
            } if			% check if we got 'obj' string
            pop				% remove ref number
          } if				% check if we got ref number
          pop				% remove obj number
        } if				% check if we got object number
      } {
        pop pop				% no 'obj' in the line: discard both copies
      } ifelse
    } if				% check if got a string from readline
    pop					% remove location
    % Check if we are approaching the end of the file.  We do not want to
    % read past the end of the file since that closes it.  We actually stop
    % 10-20 bytes early since there cannot be an object that close to the end.
    % (There is a Trailer dictionary, etc. at the end of the file.)
    PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
  } loop				% loop through the entire file
  pop pop				% remove working string and its length
  % Output warning if we have two objects with the same object and generation
  % numbers.
  dup_obj_gen_num {
    (   **** Warning:  There are objects with matching object and generation\n)
    pdfformatwarning
    (   **** numbers.  The output may be incorrect.\n)
    pdfformatwarning
  } if
  % The copy of the original offsets is only needed during the scan.
  currentdict /Orig_Objects undef
} bind executeonly def

% Report, through pdfformaterror, that a problem was found while reading the
% xref tables and that recovery will be attempted.
/print_xref_warning
{
  % The message lines are issued one at a time, in this order.
  [
    (   **** Error:  An error occurred while reading an XREF table.\n)
    (   **** The file has been damaged.  This may have been caused\n)
    (   **** by a problem while converting or transfering the file.\n)
    (   **** Ghostscript will attempt to recover the data.\n)
    (   **** However, the output may be incorrect.\n)
  ] { pdfformaterror } forall
} bind executeonly def

% Entry point for xref recovery, called when reading the normal XRef tables
% has failed.  This usually works only for files that predate PDF 1.5.
/recover_xref_data		% - recover_xref_data -
{
  % Tell the user what has happened.
  print_xref_warning

  % Discard whatever readxref left behind: pop every operand above the
  % stack depth given by pdfemptycount.
  count pdfemptycount sub
  { pop } repeat

  % Rebuild the xref information by scanning the file for objects.
  search_objects
} bind executeonly def