[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php

namespace Box\Spout\Reader\XLSX\Helper;

use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;

/**
 * Class SharedStringsHelper
 * This class provides helper functions for reading sharedStrings XML file
 *
 * Typical usage, as far as this class shows: call extractSharedStrings() once to
 * populate the cache, look strings up with getStringAtIndex(), then call cleanup().
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class SharedStringsHelper
{
    /** Path of sharedStrings XML file inside the XLSX file */
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';

    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string|null Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var CachingStrategyInterface|null The best caching strategy for storing shared strings (set by extractSharedStrings()) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     */
    public function __construct($filePath, $tempFolder = null)
    {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     * If the archive itself cannot be opened, this returns false as well.
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        $hasSharedStrings = false;
        $zip = new \ZipArchive();

        // ZipArchive::open() returns an error code (not false) on failure, hence the strict check against true
        if ($zip->open($this->filePath) === true) {
            $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
            $zip->close();
        }

        return $hasSharedStrings;
    }

    /**
     * Builds the cache containing all the shared strings of the sheet.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     * Please note that SimpleXML does not provide such a functionality but since it is faster
     * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
     *
     * The XML reader is closed on every exit path once it has been opened, including
     * when the extraction fails part-way through.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings()
    {
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;
        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $escaper = new \Box\Spout\Common\Escaper\XLSX();

        $sharedStringsFilePath = $this->getSharedStringsFilePath();
        if ($xmlReader->open($sharedStringsFilePath) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound('si');

            while ($xmlReader->name === 'si') {
                $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
                $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

                // removes nodes that should not be read, like the pronunciation of the Kanji characters
                $cleanNode = $this->removeSuperfluousTextNodes($node);

                // find all text nodes 't'; there can be multiple if the cell contains formatting
                $textNodes = $cleanNode->xpath('//ns:t');

                $textValue = '';
                foreach ($textNodes as $textNode) {
                    if ($this->shouldPreserveWhitespace($textNode)) {
                        $textValue .= $textNode->__toString();
                    } else {
                        $textValue .= trim($textNode->__toString());
                    }
                }

                $unescapedTextValue = $escaper->unescape($textValue);
                $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);

                $sharedStringIndex++;

                // jump to the next 'si' tag
                $xmlReader->next('si');
            }
        } catch (XMLProcessingException $exception) {
            // release the underlying stream before reporting the failure
            $xmlReader->close();
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } catch (\Exception $exception) {
            // Any other failure (e.g. the IOException raised while converting a node) must not
            // leak the open reader either. Written as catch-and-rethrow rather than "finally"
            // (PHP 5.5+) because nothing else in this file requires more than PHP 5.4.
            $xmlReader->close();
            throw $exception;
        }

        $this->cachingStrategy->closeCache();

        $xmlReader->close();
    }

    /**
     * @return string The path to the shared strings XML file, using the "zip://" stream wrapper
     */
    protected function getSharedStringsFilePath()
    {
        return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * The "uniqueCount" attribute is used when present. When it is missing, the "count"
     * attribute is used instead; 0 is returned only when neither attribute is available.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int Number of unique shared strings in the sharedStrings.xml file
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next('sst');

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute('uniqueCount');

        // Some producers only write the "count" attribute. "count" is the total number of
        // string references, so it is never smaller than the number of unique strings and
        // is a safer size estimate than silently reporting 0.
        if ($uniqueCount === null || $uniqueCount === '') {
            $uniqueCount = $xmlReader->getAttribute('count');
        }

        return intval($uniqueCount);
    }

    /**
     * Returns the best shared strings caching strategy, as chosen by the CachingStrategyFactory
     * from the number of strings to store and the configured temporary folder.
     *
     * @param int $sharedStringsUniqueCount
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return CachingStrategyFactory::getInstance()
                ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
    }

    /**
     * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
     * This is to simplify the parsing of the subtree.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
     */
    protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
    {
        $node = null;
        try {
            // the outer XML of the current node becomes a standalone document rooted at that node
            $node = new SimpleXMLElement($xmlReader->readOuterXml());
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
        }

        return $node;
    }

    /**
     * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
     * By keeping them, their text content would be added to the read string.
     *
     * The XPath expressions use the "ns" prefix, so the caller must have registered that
     * prefix on the node beforehand (extractSharedStrings() does so).
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node (the same instance that was passed in)
     */
    protected function removeSuperfluousTextNodes($parentNode)
    {
        $tagsToRemove = [
            'rPh', // Pronunciation of the text
        ];

        foreach ($tagsToRemove as $tagToRemove) {
            $xpath = '//ns:' . $tagToRemove;
            $parentNode->removeNodesMatchingXPath($xpath);
        }

        return $parentNode;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute('space', 'xml');
        return ($spaceValue === 'preserve');
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called first, since it is what sets the strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Safe to call even if no shared strings were ever extracted.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy) {
            $this->cachingStrategy->clearCache();
        }
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |