SetaPDF-Core - Detect colors in PDF files
With the Core component it is possible to walk through a PDF document at its lowest level and analyse its internal structure. In this demo we show how to collect information about the colors and color spaces used in a PDF document with PHP.
The demo makes use of two individual classes:
StreamProcessor.php
PHP
<?php

/**
 * Class StreamProcessor
 *
 * This class offers the callback methods that are registered at the content
 * stream parser. Every color or color space that is encountered is reported
 * to the given ColorInspector through its addFoundColor() method.
 */
class StreamProcessor
{
    /**
     * The inspector that collects the found color definitions.
     *
     * @var \ColorInspector
     */
    protected $_colorInspector;

    /**
     * The canvas whose content stream and resources are analysed.
     *
     * @var \SetaPDF_Core_Canvas
     */
    protected $_canvas;

    /**
     * The content stream parser, created in process().
     *
     * @var \SetaPDF_Core_Parser_Content
     */
    protected $_parser;

    /**
     * Form XObjects on the current processing chain, keyed by
     * "<object id> <generation>". Used to stop a form that paints itself
     * (directly or through other forms) from recursing without end.
     *
     * @var array
     */
    protected $_activeForms = [];

    /**
     * The constructor
     *
     * @param \SetaPDF_Core_Canvas $canvas
     * @param \ColorInspector $colorInspector
     */
    public function __construct(\SetaPDF_Core_Canvas $canvas, \ColorInspector $colorInspector)
    {
        $this->_canvas = $canvas;
        $this->_colorInspector = $colorInspector;
    }

    /**
     * Callback for standard color operators
     *
     * The operands are turned into a color object and reported under the
     * matching device color space name. Other color types are not reported.
     *
     * @param array $args
     * @param string $operator
     */
    public function _color(array $args, $operator)
    {
        $color = \SetaPDF_Core_DataStructure_Color::createByComponents($args);

        if ($color instanceof \SetaPDF_Core_DataStructure_Color_Rgb) {
            $family = 'DeviceRGB';
        } elseif ($color instanceof \SetaPDF_Core_DataStructure_Color_Gray) {
            $family = 'DeviceGray';
        } elseif ($color instanceof \SetaPDF_Core_DataStructure_Color_Cmyk) {
            $family = 'DeviceCMYK';
        } else {
            return;
        }

        $info = 'Standard color operator (' . $operator . ') in content stream.';
        $this->_colorInspector->addFoundColor($family, $color, $info);
    }

    /**
     * Callback for color space operators
     *
     * If the operand names an entry of the canvas' color space resources, that
     * entry is used as the definition; otherwise the operand itself is used.
     *
     * @param array $args
     * @param string $operator
     */
    public function _colorSpace(array $args, $operator)
    {
        $definition = $args[0];

        $resources = $this->_canvas->getResources(true, false, \SetaPDF_Core_Resource::TYPE_COLOR_SPACE);
        if ($resources && $resources->offsetExists($definition->getValue())) {
            $definition = $resources->getValue($definition->getValue());
        }

        $colorSpace = \SetaPDF_Core_ColorSpace::createByDefinition($definition);
        $info = 'Color space operator (' . $operator . ') in content stream.';
        $this->_resolveColorSpace($colorSpace, $info);
    }

    /**
     * Helper method to recursively resolve a color space and its alternate/base color spaces
     *
     * The color space itself is always reported. For Separation, DeviceN,
     * Indexed and ICC based color spaces the nested color space is resolved too.
     *
     * @param \SetaPDF_Core_ColorSpace $colorSpace
     * @param string $info
     */
    protected function _resolveColorSpace(\SetaPDF_Core_ColorSpace $colorSpace, $info)
    {
        $this->_colorInspector->addFoundColor($colorSpace->getFamily(), $colorSpace, $info);

        if ($colorSpace instanceof \SetaPDF_Core_ColorSpace_Separation) {
            $this->_resolveColorSpace(
                $colorSpace->getAlternateColorSpace(),
                'Alternate color space for Separation color space.'
            );
        } elseif ($colorSpace instanceof \SetaPDF_Core_ColorSpace_DeviceN) {
            $this->_resolveColorSpace(
                $colorSpace->getAlternateColorSpace(),
                'Alternate color space for DeviceN color space.'
            );
        } elseif ($colorSpace instanceof \SetaPDF_Core_ColorSpace_Indexed) {
            $this->_resolveColorSpace(
                $colorSpace->getBase(),
                'Base color space for Indexed color space.'
            );
        } elseif ($colorSpace instanceof \SetaPDF_Core_ColorSpace_IccBased) {
            $stream = $colorSpace->getIccProfileStream();

            $alternate = $stream->getAlternate();
            if ($alternate) {
                $this->_resolveColorSpace($alternate, 'Alternate color space for ICC profile color space.');
            }

            /* See ICC.1:2010 - Table 19 (ICC1v43_2010-12.pdf) */
            $signature = $stream->getParser()->getColorSpace();
            $this->_colorInspector->addFoundColor(
                trim($signature),
                $stream,
                'Color space signature extracted from ICC profile.'
            );
        }
    }

    /**
     * Callback for painting a XObject
     *
     * Images report their color space, Form XObjects are processed recursively.
     *
     * @param array $args
     */
    public function _paintXObject($args)
    {
        $name = $args[0]->getValue();

        $xObjects = $this->_canvas->getResources(true, false, \SetaPDF_Core_Resource::TYPE_X_OBJECT);
        if ($xObjects === false) {
            return;
        }

        $reference = $xObjects->getValue($name);
        if (!($reference instanceof \SetaPDF_Core_Type_IndirectReference)) {
            return;
        }

        $xObject = \SetaPDF_Core_XObject::get($reference);
        if ($xObject instanceof \SetaPDF_Core_XObject_Image) {
            $this->_inspectImage($xObject);
        } elseif ($xObject instanceof \SetaPDF_Core_XObject_Form) {
            $this->_inspectForm($xObject, $reference);
        }
    }

    /**
     * Reports the color space of an image XObject (image masks are skipped).
     *
     * @param \SetaPDF_Core_XObject_Image $image
     */
    protected function _inspectImage(\SetaPDF_Core_XObject_Image $image)
    {
        $dict = $image->getIndirectObject()->ensure()->getValue();
        if ($dict->offsetExists('ImageMask') && $dict->getValue('ImageMask')->ensure()->getValue() == true) {
            return;
        }

        $info = 'Color space of an image used in a content stream.';
        $this->_resolveColorSpace($image->getColorSpace(), $info);
    }

    /**
     * Reports the transparency group color space of a Form XObject and processes its content stream.
     *
     * @param \SetaPDF_Core_XObject_Form $form
     * @param \SetaPDF_Core_Type_IndirectReference $reference The reference through which the form was painted
     */
    protected function _inspectForm(\SetaPDF_Core_XObject_Form $form, \SetaPDF_Core_Type_IndirectReference $reference)
    {
        /* Get the colorspace from the transparency group */
        $group = $form->getGroup();
        if ($group instanceof \SetaPDF_Core_TransparencyGroup) {
            $definition = $group->getColorSpace(true);
            if ($definition !== null) {
                $info = 'Color space from Transparency Group of XObject.';
                $this->_resolveColorSpace(\SetaPDF_Core_ColorSpace::createByDefinition($definition), $info);
            }
        }

        /* A form that is already being processed further up the chain paints
         * itself (directly or indirectly). Descending again would never end. */
        $key = $reference->getObjectId() . ' ' . $reference->getGen();
        if (isset($this->_activeForms[$key])) {
            return;
        }

        /* We got a Form XObject - start recursive processing */
        $streamProcessor = new self($form->getCanvas(), $this->_colorInspector);
        $streamProcessor->_activeForms = $this->_activeForms;
        $streamProcessor->_activeForms[$key] = true;
        $streamProcessor->process();
    }

    /**
     * Callback for inline image operator
     *
     * The operands are read as key/value pairs. The color space is taken from
     * the "CS" entry or, if that is missing, from "ColorSpace". The abbreviated
     * names G, RGB, CMYK and I are expanded.
     *
     * @param array $args
     */
    public function _startInlineImageData($args)
    {
        $dict = new \SetaPDF_Core_Type_Dictionary();
        for ($i = 0, $c = count($args); $i < $c; $i += 2) {
            $dict[$args[$i]] = $args[$i + 1];
        }

        $entry = $dict->offsetExists('CS') ? $dict->getValue('CS') : $dict->getValue('ColorSpace');
        if (null === $entry) {
            return;
        }

        // NOTE(review): this treats the entry as a plain name. An array valued
        // color space or a name that refers to the color space resources is not
        // resolved here - confirm whether such inline images need support.
        $colorSpace = $entry->getValue();
        switch ($colorSpace) {
            case 'G':
                $colorSpace = 'DeviceGray';
                break;
            case 'RGB':
                $colorSpace = 'DeviceRGB';
                break;
            case 'CMYK':
                $colorSpace = 'DeviceCMYK';
                break;
            case 'I':
                $colorSpace = 'Indexed';
                break;
        }

        $info = 'Color space of an inline image in content stream.';
        $this->_colorInspector->addFoundColor(
            $colorSpace,
            \SetaPDF_Core_ColorSpace::createByDefinition($colorSpace),
            $info
        );
    }

    /**
     * Callback for shading operator
     *
     * Looks up the named shading in the canvas' shading resources and reports
     * the color space found in its "ColorSpace" entry. Shadings that cannot be
     * resolved, or that have no such entry, are ignored.
     *
     * @param array $args
     */
    public function _paintShapeAndColourShading($args)
    {
        $name = $args[0]->getValue();

        $shadings = $this->_canvas->getResources(true, false, \SetaPDF_Core_Resource::TYPE_SHADING);
        if ($shadings === false) {
            return;
        }

        $shadingValue = $shadings->getValue($name);
        if ($shadingValue === null) {
            return;
        }

        // The resource entry may be an indirect reference or a dictionary that
        // is stored directly in the resource dictionary; ensure() covers both.
        try {
            $shading = $shadingValue->ensure();
        } catch (\SetaPDF_Core_Type_IndirectReference_Exception $e) {
            return;
        }

        if ($shading instanceof \SetaPDF_Core_Type_Stream) {
            $shading = $shading->getValue();
        }

        if (!($shading instanceof \SetaPDF_Core_Type_Dictionary)) {
            return;
        }

        $colorSpaceValue = $shading->getValue('ColorSpace');
        if ($colorSpaceValue === null) {
            return;
        }

        $colorSpace = \SetaPDF_Core_ColorSpace::createByDefinition($colorSpaceValue);
        $info = 'Paint shading operator in content stream.';
        $this->_resolveColorSpace($colorSpace, $info);
    }

    /**
     * Process the content stream
     *
     * Registers the callbacks of this class at a new content parser and runs it
     * over the stream of the canvas.
     */
    public function process()
    {
        try {
            $stream = $this->_canvas->getStream();
        } catch (\SetaPDF_Core_Filter_Exception $e) {
            // if a stream cannot be unfiltered, we ignore it
            return;
        }

        $this->_parser = new \SetaPDF_Core_Parser_Content($stream);

        /* Register colorspace operators
         * e.g. -> /DeviceRGB CS % Set DeviceRGB colour space
         */
        $this->_parser->registerOperator(
            ['CS', 'cs'],
            [$this, '_colorSpace']
        );

        /* Register default color space operators */
        $this->_parser->registerOperator(
            ['G', 'g', 'RG', 'rg', 'K', 'k'],
            [$this, '_color']
        );

        /* Register draw operator for XObjects */
        $this->_parser->registerOperator('Do', [$this, '_paintXObject']);

        /* Inline image */
        $this->_parser->registerOperator('ID', [$this, '_startInlineImageData']);

        /* Shading Operator */
        $this->_parser->registerOperator('sh', [$this, '_paintShapeAndColourShading']);

        $this->_parser->process();
    }
}
ColorInspector.php
PHP
<?php

/**
 * Class ColorInspector
 *
 * Walks through the pages of a document (and optionally through the appearance
 * streams of their annotations), runs a StreamProcessor on every canvas and
 * collects the color definitions the processors report via addFoundColor().
 */
class ColorInspector
{
    /**
     * @var \SetaPDF_Core_Document
     */
    protected $_document;

    /**
     * All found color definitions
     *
     * @var array
     */
    protected $_colors = [];

    /**
     * Information about the currently processed "location"
     *
     * @var string
     */
    protected $_currentLocation;

    /**
     * The constructor
     *
     * @param \SetaPDF_Core_Document $document
     */
    public function __construct(\SetaPDF_Core_Document $document)
    {
        $this->_document = $document;
    }

    /**
     * Get all used colors
     *
     * Every call analyses the document anew; results of a previous call are
     * discarded so that entries are not reported twice.
     *
     * @param bool $processAnnotations Set to false to ignore color definitions in annotation appearance streams
     * @param null|int $maxPages The maximum of pages to process
     * @return array List of arrays with the keys "colorSpace", "data", "info" and "location"
     */
    public function getColors($processAnnotations = true, $maxPages = null)
    {
        // Without this reset a second call would append to the first result.
        $this->_colors = [];

        $pages = $this->_document->getCatalog()->getPages();
        $pageCount = $pages->count();
        $lastPage = $maxPages === null ? $pageCount : min($maxPages, $pageCount);

        for ($pageNo = 1; $pageNo <= $lastPage; $pageNo++) {
            $this->_currentLocation = 'Page ' . $pageNo;

            $page = $pages->getPage($pageNo);
            $this->_processCanvas($page->getCanvas());

            if ($processAnnotations) {
                $this->_processAnnotations($page, $pageNo);
            }
        }

        return $this->_colors;
    }

    /**
     * Runs a stream processor on the appearance streams of all annotations of a page.
     *
     * @param mixed $page The page object as returned by the pages helper
     * @param int $pageNo The page number, used for the location information
     */
    protected function _processAnnotations($page, $pageNo)
    {
        foreach ($page->getAnnotations()->getAll() as $annotation) {
            $dict = $annotation->getDictionary();

            $ap = $dict->getValue('AP');
            if (null === $ap) {
                continue;
            }

            // The entry may be stored as an indirect reference; resolve it
            // before iterating and skip anything that is not a dictionary.
            $ap = $ap->ensure();
            if (!($ap instanceof \SetaPDF_Core_Type_Dictionary)) {
                continue;
            }

            // A missing Subtype entry must not abort the whole analysis.
            $subtype = $dict->getValue('Subtype');
            $subtypeName = $subtype === null ? 'unknown' : $subtype->ensure()->getValue();
            $this->_currentLocation = 'Annotation (' . $subtypeName . ') on Page ' . $pageNo;

            foreach ($ap as $type => $value) {
                $object = $value->ensure();

                if ($object instanceof \SetaPDF_Core_Type_Stream) {
                    $this->_processAppearance($annotation->getAppearance($type));
                } elseif ($object instanceof \SetaPDF_Core_Type_Dictionary) {
                    // A dictionary holds one appearance stream per state name.
                    foreach ($object as $subType => $subValue) {
                        if ($subValue->ensure() instanceof \SetaPDF_Core_Type_Stream) {
                            $this->_processAppearance($annotation->getAppearance($type, $subType));
                        }
                    }
                }
            }
        }
    }

    /**
     * Processes the canvas of an appearance, if there is one.
     *
     * @param mixed $appearance The value returned by the annotation's getAppearance() method
     */
    protected function _processAppearance($appearance)
    {
        // NOTE(review): presumably getAppearance() can return null when the
        // appearance cannot be resolved - the guard is harmless otherwise.
        if ($appearance === null) {
            return;
        }

        $this->_processCanvas($appearance->getCanvas());
    }

    /**
     * Runs a new stream processor, bound to this inspector, on a canvas.
     *
     * @param mixed $canvas The canvas passed on to the StreamProcessor constructor
     */
    protected function _processCanvas($canvas)
    {
        $streamProcessor = new \StreamProcessor($canvas, $this);
        $streamProcessor->process();
    }

    /**
     * A method which will register found color definitions.
     *
     * The entry is stored together with the location that is currently processed.
     *
     * @param string $colorSpace
     * @param mixed $data
     * @param null|string $info
     */
    public function addFoundColor($colorSpace, $data = null, $info = null)
    {
        $this->_colors[] = [
            'colorSpace' => $colorSpace,
            'data' => $data,
            'info' => $info,
            'location' => $this->_currentLocation,
        ];
    }
}