[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/lib/spout/src/Spout/Reader/XLSX/Helper/ -> SharedStringsHelper.php (source)

   1  <?php
   2  
   3  namespace Box\Spout\Reader\XLSX\Helper;
   4  
   5  use Box\Spout\Common\Exception\IOException;
   6  use Box\Spout\Reader\Exception\XMLProcessingException;
   7  use Box\Spout\Reader\Wrapper\SimpleXMLElement;
   8  use Box\Spout\Reader\Wrapper\XMLReader;
   9  use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
  10  use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
  11  
  12  /**
  13   * Class SharedStringsHelper
  14   * This class provides helper functions for reading sharedStrings XML file
  15   *
  16   * @package Box\Spout\Reader\XLSX\Helper
  17   */
  18  class SharedStringsHelper
  19  {
  20      /** Path of sharedStrings XML file inside the XLSX file */
  21      const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';
  22  
  23      /** Main namespace for the sharedStrings.xml file */
  24      const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
  25  
  26      /** @var string Path of the XLSX file being read */
  27      protected $filePath;
  28  
  29      /** @var string Temporary folder where the temporary files to store shared strings will be stored */
  30      protected $tempFolder;
  31  
  32      /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
  33      protected $cachingStrategy;
  34  
  35      /**
  36       * @param string $filePath Path of the XLSX file being read
  37       * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored
  38       */
  39      public function __construct($filePath, $tempFolder = null)
  40      {
  41          $this->filePath = $filePath;
  42          $this->tempFolder = $tempFolder;
  43      }
  44  
  45      /**
  46       * Returns whether the XLSX file contains a shared strings XML file
  47       *
  48       * @return bool
  49       */
  50      public function hasSharedStrings()
  51      {
  52          $hasSharedStrings = false;
  53          $zip = new \ZipArchive();
  54  
  55          if ($zip->open($this->filePath) === true) {
  56              $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
  57              $zip->close();
  58          }
  59  
  60          return $hasSharedStrings;
  61      }
  62  
  63      /**
  64       * Builds an in-memory array containing all the shared strings of the sheet.
  65       * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  66       * It is then accessed by the sheet data, via the string index in the built table.
  67       *
  68       * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  69       *
  70       * The XML file can be really big with sheets containing a lot of data. That is why
  71       * we need to use a XML reader that provides streaming like the XMLReader library.
  72       * Please note that SimpleXML does not provide such a functionality but since it is faster
  73       * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
  74       *
  75       * @return void
  76       * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
  77       */
  78      public function extractSharedStrings()
  79      {
  80          $xmlReader = new XMLReader();
  81          $sharedStringIndex = 0;
  82          /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
  83          $escaper = new \Box\Spout\Common\Escaper\XLSX();
  84  
  85          $sharedStringsFilePath = $this->getSharedStringsFilePath();
  86          if ($xmlReader->open($sharedStringsFilePath) === false) {
  87              throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
  88          }
  89  
  90          try {
  91              $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
  92              $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
  93  
  94              $xmlReader->readUntilNodeFound('si');
  95  
  96              while ($xmlReader->name === 'si') {
  97                  $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
  98                  $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
  99  
 100                  // removes nodes that should not be read, like the pronunciation of the Kanji characters
 101                  $cleanNode = $this->removeSuperfluousTextNodes($node);
 102  
 103                  // find all text nodes 't'; there can be multiple if the cell contains formatting
 104                  $textNodes = $cleanNode->xpath('//ns:t');
 105  
 106                  $textValue = '';
 107                  foreach ($textNodes as $textNode) {
 108                      if ($this->shouldPreserveWhitespace($textNode)) {
 109                          $textValue .= $textNode->__toString();
 110                      } else {
 111                          $textValue .= trim($textNode->__toString());
 112                      }
 113                  }
 114  
 115                  $unescapedTextValue = $escaper->unescape($textValue);
 116                  $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
 117  
 118                  $sharedStringIndex++;
 119  
 120                  // jump to the next 'si' tag
 121                  $xmlReader->next('si');
 122              }
 123  
 124          } catch (XMLProcessingException $exception) {
 125              throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
 126          }
 127  
 128          $this->cachingStrategy->closeCache();
 129  
 130          $xmlReader->close();
 131      }
 132  
 133      /**
 134       * @return string The path to the shared strings XML file
 135       */
 136      protected function getSharedStringsFilePath()
 137      {
 138          return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
 139      }
 140  
 141      /**
 142       * Returns the shared strings unique count, as specified in <sst> tag.
 143       *
 144       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
 145       * @return int Number of unique shared strings in the sharedStrings.xml file
 146       * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 147       */
 148      protected function getSharedStringsUniqueCount($xmlReader)
 149      {
 150          $xmlReader->next('sst');
 151  
 152          // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
 153          while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) {
 154              $xmlReader->read();
 155          }
 156  
 157          return intval($xmlReader->getAttribute('uniqueCount'));
 158      }
 159  
 160      /**
 161       * Returns the best shared strings caching strategy.
 162       *
 163       * @param int $sharedStringsUniqueCount
 164       * @return CachingStrategyInterface
 165       */
 166      protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
 167      {
 168          return CachingStrategyFactory::getInstance()
 169                  ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
 170      }
 171  
 172      /**
 173       * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
 174       * This is to simplify the parsing of the subtree.
 175       *
 176       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
 177       * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
 178       * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
 179       */
 180      protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
 181      {
 182          $node = null;
 183          try {
 184              $node = new SimpleXMLElement($xmlReader->readOuterXml());
 185          } catch (XMLProcessingException $exception) {
 186              throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
 187          }
 188  
 189          return $node;
 190      }
 191  
 192      /**
 193       * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
 194       * By keeping them, their text content would be added to the read string.
 195       *
 196       * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
 197       * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
 198       */
 199      protected function removeSuperfluousTextNodes($parentNode)
 200      {
 201          $tagsToRemove = [
 202              'rPh', // Pronunciation of the text
 203          ];
 204  
 205          foreach ($tagsToRemove as $tagToRemove) {
 206              $xpath = '//ns:' . $tagToRemove;
 207              $parentNode->removeNodesMatchingXPath($xpath);
 208          }
 209  
 210          return $parentNode;
 211      }
 212  
 213      /**
 214       * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
 215       *
 216       * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
 217       * @return bool Whether whitespace should be preserved
 218       */
 219      protected function shouldPreserveWhitespace($textNode)
 220      {
 221          $spaceValue = $textNode->getAttribute('space', 'xml');
 222          return ($spaceValue === 'preserve');
 223      }
 224  
 225      /**
 226       * Returns the shared string at the given index, using the previously chosen caching strategy.
 227       *
 228       * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 229       * @return string The shared string at the given index
 230       * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 231       */
 232      public function getStringAtIndex($sharedStringIndex)
 233      {
 234          return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
 235      }
 236  
 237      /**
 238       * Destroys the cache, freeing memory and removing any created artifacts
 239       *
 240       * @return void
 241       */
 242      public function cleanup()
 243      {
 244          if ($this->cachingStrategy) {
 245              $this->cachingStrategy->clearCache();
 246          }
 247      }
 248  }


Generated: Thu Aug 11 10:00:09 2016 Cross-referenced by PHPXref 0.7.1