[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 <?php 2 // This file is part of Moodle - http://moodle.org/ 3 // 4 // Moodle is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // Moodle is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU General Public License for more details. 13 // 14 // You should have received a copy of the GNU General Public License 15 // along with Moodle. If not, see <http://www.gnu.org/licenses/>. 16 17 /** 18 * Solr engine. 19 * 20 * @package search_solr 21 * @copyright 2015 Daniel Neis Araujo 22 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later 23 */ 24 25 namespace search_solr; 26 27 defined('MOODLE_INTERNAL') || die(); 28 29 /** 30 * Solr engine. 31 * 32 * @package search_solr 33 * @copyright 2015 Daniel Neis Araujo 34 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later 35 */ 36 class engine extends \core_search\engine { 37 38 /** 39 * @var string The date format used by solr. 40 */ 41 const DATE_FORMAT = 'Y-m-d\TH:i:s\Z'; 42 43 /** 44 * @var int Commit documents interval (number of miliseconds). 45 */ 46 const AUTOCOMMIT_WITHIN = 15000; 47 48 /** 49 * The maximum number of results to fetch at a time. 50 */ 51 const QUERY_SIZE = 120; 52 53 /** 54 * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending. 55 */ 56 const FRAG_SIZE = 510; 57 58 /** 59 * Marker for the start of a highlight. 60 */ 61 const HIGHLIGHT_START = '@@HI_S@@'; 62 63 /** 64 * Marker for the end of a highlight. 65 */ 66 const HIGHLIGHT_END = '@@HI_E@@'; 67 68 /** 69 * @var \SolrClient 70 */ 71 protected $client = null; 72 73 /** 74 * @var bool True if we should reuse SolrClients, false if not. 75 */ 76 protected $cacheclient = true; 77 78 /** 79 * @var \curl Direct curl object. 80 */ 81 protected $curl = null; 82 83 /** 84 * @var array Fields that can be highlighted. 85 */ 86 protected $highlightfields = array('title', 'content', 'description1', 'description2'); 87 88 /** 89 * @var int Number of total docs reported by Sorl for the last query. 90 */ 91 protected $totalenginedocs = 0; 92 93 /** 94 * @var int Number of docs we have processed for the last query. 95 */ 96 protected $processeddocs = 0; 97 98 /** 99 * @var int Number of docs that have been skipped while processing the last query. 100 */ 101 protected $skippeddocs = 0; 102 103 /** 104 * Initialises the search engine configuration. 105 * 106 * @return void 107 */ 108 public function __construct() { 109 parent::__construct(); 110 111 $curlversion = curl_version(); 112 if (isset($curlversion['version']) && stripos($curlversion['version'], '7.35.') === 0) { 113 // There is a flaw with curl 7.35.0 that causes problems with client reuse. 114 $this->cacheclient = false; 115 } 116 } 117 118 /** 119 * Prepares a Solr query, applies filters and executes it returning its results. 120 * 121 * @throws \core_search\engine_exception 122 * @param stdClass $filters Containing query and filters. 123 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts. 124 * @param int $limit The maximum number of results to return. 125 * @return \core_search\document[] Results or false if no results 126 */ 127 public function execute_query($filters, $usercontexts, $limit = 0) { 128 global $USER; 129 130 if (empty($limit)) { 131 $limit = \core_search\manager::MAX_RESULTS; 132 } 133 134 // If there is any problem we trigger the exception as soon as possible. 135 $client = $this->get_search_client(); 136 137 // Create the query object. 138 $query = $this->create_user_query($filters, $usercontexts); 139 140 // We expect good match rates, so for our first get, we will get a small number of records. 141 // This significantly speeds solr response time for first few pages. 142 $query->setRows(min($limit * 3, static::QUERY_SIZE)); 143 $response = $this->get_query_response($query); 144 145 // Get count data out of the response, and reset our counters. 146 list($included, $found) = $this->get_response_counts($response); 147 $this->totalenginedocs = $found; 148 $this->processeddocs = 0; 149 $this->skippeddocs = 0; 150 if ($included == 0 || $this->totalenginedocs == 0) { 151 // No results. 152 return array(); 153 } 154 155 // Get valid documents out of the response. 156 $results = $this->process_response($response, $limit); 157 158 // We have processed all the docs in the response at this point. 159 $this->processeddocs += $included; 160 161 // If we haven't reached the limit, and there are more docs left in Solr, lets keep trying. 162 while (count($results) < $limit && ($this->totalenginedocs - $this->processeddocs) > 0) { 163 // Offset the start of the query, and since we are making another call, get more per call. 164 $query->setStart($this->processeddocs); 165 $query->setRows(static::QUERY_SIZE); 166 167 $response = $this->get_query_response($query); 168 list($included, $found) = $this->get_response_counts($response); 169 if ($included == 0 || $found == 0) { 170 // No new results were found. Found being empty would be weird, so we will just return. 171 return $results; 172 } 173 $this->totalenginedocs = $found; 174 175 // Get the new response docs, limiting to remaining we need, then add it to the end of the results array. 176 $newdocs = $this->process_response($response, $limit - count($results)); 177 $results = array_merge($results, $newdocs); 178 179 // Add to our processed docs count. 180 $this->processeddocs += $included; 181 } 182 183 return $results; 184 } 185 186 /** 187 * Takes a query and returns the response in SolrObject format. 188 * 189 * @param SolrQuery $query Solr query object. 190 * @return SolrObject|false Response document or false on error. 191 */ 192 protected function get_query_response($query) { 193 try { 194 return $this->get_search_client()->query($query)->getResponse(); 195 } catch (\SolrClientException $ex) { 196 debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER); 197 $this->queryerror = $ex->getMessage(); 198 return false; 199 } catch (\SolrServerException $ex) { 200 debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER); 201 $this->queryerror = $ex->getMessage(); 202 return false; 203 } 204 } 205 206 /** 207 * Returns the total number of documents available for the most recently call to execute_query. 208 * 209 * @return int 210 */ 211 public function get_query_total_count() { 212 // Return the total engine count minus the docs we have determined are bad. 213 return $this->totalenginedocs - $this->skippeddocs; 214 } 215 216 /** 217 * Returns count information for a provided response. Will return 0, 0 for invalid or empty responses. 218 * 219 * @param SolrDocument $response The response document from Solr. 220 * @return array A two part array. First how many response docs are in the response. 221 * Second, how many results are vailable in the engine. 222 */ 223 protected function get_response_counts($response) { 224 $found = 0; 225 $included = 0; 226 227 if (isset($response->grouped->solr_filegroupingid->ngroups)) { 228 // Get the number of results for file grouped queries. 229 $found = $response->grouped->solr_filegroupingid->ngroups; 230 $included = count($response->grouped->solr_filegroupingid->groups); 231 } else if (isset($response->response->numFound)) { 232 // Get the number of results for standard queries. 233 $found = $response->response->numFound; 234 $included = count($response->response->docs); 235 } 236 237 return array($included, $found); 238 } 239 240 /** 241 * Prepares a new query object with needed limits, filters, etc. 242 * 243 * @param stdClass $filters Containing query and filters. 244 * @param array $usercontexts Contexts where the user has access. True if the user can access all contexts. 245 * @return SolrDisMaxQuery 246 */ 247 protected function create_user_query($filters, $usercontexts) { 248 global $USER; 249 250 // Let's keep these changes internal. 251 $data = clone $filters; 252 253 $query = new \SolrDisMaxQuery(); 254 255 $this->set_query($query, $data->q); 256 $this->add_fields($query); 257 258 // Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters 259 // we are really interested in caching contexts filters instead. 260 if (!empty($data->title)) { 261 $query->addFilterQuery('{!field cache=false f=title}' . $data->title); 262 } 263 if (!empty($data->areaids)) { 264 // If areaids are specified, we want to get any that match. 265 $query->addFilterQuery('{!cache=false}areaid:(' . implode(' OR ', $data->areaids) . ')'); 266 } 267 if (!empty($data->courseids)) { 268 $query->addFilterQuery('{!cache=false}courseid:(' . implode(' OR ', $data->courseids) . ')'); 269 } 270 271 if (!empty($data->timestart) or !empty($data->timeend)) { 272 if (empty($data->timestart)) { 273 $data->timestart = '*'; 274 } else { 275 $data->timestart = \search_solr\document::format_time_for_engine($data->timestart); 276 } 277 if (empty($data->timeend)) { 278 $data->timeend = '*'; 279 } else { 280 $data->timeend = \search_solr\document::format_time_for_engine($data->timeend); 281 } 282 283 // No cache. 284 $query->addFilterQuery('{!cache=false}modified:[' . $data->timestart . ' TO ' . $data->timeend . ']'); 285 } 286 287 // Restrict to users who are supposed to be able to see a particular result. 288 $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')'); 289 290 // And finally restrict it to the context where the user can access, we want this one cached. 291 // If the user can access all contexts $usercontexts value is just true, we don't need to filter 292 // in that case. 293 if ($usercontexts && is_array($usercontexts)) { 294 // Join all area contexts into a single array and implode. 295 $allcontexts = array(); 296 foreach ($usercontexts as $areaid => $areacontexts) { 297 if (!empty($data->areaids) && !in_array($areaid, $data->areaids)) { 298 // Skip unused areas. 299 continue; 300 } 301 foreach ($areacontexts as $contextid) { 302 // Ensure they are unique. 303 $allcontexts[$contextid] = $contextid; 304 } 305 } 306 if (empty($allcontexts)) { 307 // This means there are no valid contexts for them, so they get no results. 308 return array(); 309 } 310 $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')'); 311 } 312 313 if ($this->file_indexing_enabled()) { 314 // Now group records by solr_filegroupingid. Limit to 3 results per group. 315 $query->setGroup(true); 316 $query->setGroupLimit(3); 317 $query->setGroupNGroups(true); 318 $query->addGroupField('solr_filegroupingid'); 319 } else { 320 // Make sure we only get text files, in case the index has pre-existing files. 321 $query->addFilterQuery('type:'.\core_search\manager::TYPE_TEXT); 322 } 323 324 return $query; 325 } 326 327 /** 328 * Prepares a new query by setting the query, start offset and rows to return. 329 * 330 * @param SolrQuery $query 331 * @param object $q Containing query and filters. 332 */ 333 protected function set_query($query, $q) { 334 // Set hightlighting. 335 $query->setHighlight(true); 336 foreach ($this->highlightfields as $field) { 337 $query->addHighlightField($field); 338 } 339 $query->setHighlightFragsize(static::FRAG_SIZE); 340 $query->setHighlightSimplePre(self::HIGHLIGHT_START); 341 $query->setHighlightSimplePost(self::HIGHLIGHT_END); 342 $query->setHighlightMergeContiguous(true); 343 344 $query->setQuery($q); 345 346 // A reasonable max. 347 $query->setRows(static::QUERY_SIZE); 348 } 349 350 /** 351 * Sets fields to be returned in the result. 352 * 353 * @param SolrDisMaxQuery|SolrQuery $query object. 354 */ 355 public function add_fields($query) { 356 $documentclass = $this->get_document_classname(); 357 $fields = $documentclass::get_default_fields_definition(); 358 359 $dismax = false; 360 if ($query instanceof \SolrDisMaxQuery) { 361 $dismax = true; 362 } 363 364 foreach ($fields as $key => $field) { 365 $query->addField($key); 366 if ($dismax && !empty($field['mainquery'])) { 367 // Add fields the main query should be run against. 368 $query->addQueryField($key); 369 } 370 } 371 } 372 373 /** 374 * Finds the key common to both highlighing and docs array returned from response. 375 * @param object $response containing results. 376 */ 377 public function add_highlight_content($response) { 378 if (!isset($response->highlighting)) { 379 // There is no highlighting to add. 380 return; 381 } 382 383 $highlightedobject = $response->highlighting; 384 foreach ($response->response->docs as $doc) { 385 $x = $doc->id; 386 $highlighteddoc = $highlightedobject->$x; 387 $this->merge_highlight_field_values($doc, $highlighteddoc); 388 } 389 } 390 391 /** 392 * Adds the highlighting array values to docs array values. 393 * 394 * @throws \core_search\engine_exception 395 * @param object $doc containing the results. 396 * @param object $highlighteddoc containing the highlighted results values. 397 */ 398 public function merge_highlight_field_values($doc, $highlighteddoc) { 399 400 foreach ($this->highlightfields as $field) { 401 if (!empty($doc->$field)) { 402 403 // Check that the returned value is not an array. No way we can make this work with multivalued solr fields. 404 if (is_array($doc->{$field})) { 405 throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $field); 406 } 407 408 if (!empty($highlighteddoc->$field)) { 409 // Replace by the highlighted result. 410 $doc->$field = reset($highlighteddoc->$field); 411 } 412 } 413 } 414 } 415 416 /** 417 * Filters the response on Moodle side. 418 * 419 * @param SolrObject $response Solr object containing the response return from solr server. 420 * @param int $limit The maximum number of results to return. 0 for all. 421 * @param bool $skipaccesscheck Don't use check_access() on results. Only to be used when results have known access. 422 * @return array $results containing final results to be displayed. 423 */ 424 protected function process_response($response, $limit = 0, $skipaccesscheck = false) { 425 global $USER; 426 427 if (empty($response)) { 428 return array(); 429 } 430 431 if (isset($response->grouped)) { 432 return $this->grouped_files_process_response($response, $limit); 433 } 434 435 $userid = $USER->id; 436 $noownerid = \core_search\manager::NO_OWNER_ID; 437 438 $numgranted = 0; 439 440 if (!$docs = $response->response->docs) { 441 return array(); 442 } 443 444 $out = array(); 445 if (!empty($response->response->numFound)) { 446 $this->add_highlight_content($response); 447 448 // Iterate through the results checking its availability and whether they are available for the user or not. 449 foreach ($docs as $key => $docdata) { 450 if ($docdata['owneruserid'] != $noownerid && $docdata['owneruserid'] != $userid) { 451 // If owneruserid is set, no other user should be able to access this record. 452 continue; 453 } 454 455 if (!$searcharea = $this->get_search_area($docdata->areaid)) { 456 continue; 457 } 458 459 $docdata = $this->standarize_solr_obj($docdata); 460 461 if ($skipaccesscheck) { 462 $access = \core_search\manager::ACCESS_GRANTED; 463 } else { 464 $access = $searcharea->check_access($docdata['itemid']); 465 } 466 switch ($access) { 467 case \core_search\manager::ACCESS_DELETED: 468 $this->delete_by_id($docdata['id']); 469 // Remove one from our processed and total counters, since we promptly deleted. 470 $this->processeddocs--; 471 $this->totalenginedocs--; 472 break; 473 case \core_search\manager::ACCESS_DENIED: 474 $this->skippeddocs++; 475 break; 476 case \core_search\manager::ACCESS_GRANTED: 477 $numgranted++; 478 479 // Add the doc. 480 $out[] = $this->to_document($searcharea, $docdata); 481 break; 482 } 483 484 // Stop when we hit our limit. 485 if (!empty($limit) && count($out) >= $limit) { 486 break; 487 } 488 } 489 } 490 491 return $out; 492 } 493 494 /** 495 * Processes grouped file results into documents, with attached matching files. 496 * 497 * @param SolrObject $response The response returned from solr server 498 * @param int $limit The maximum number of results to return. 0 for all. 499 * @return array Final results to be displayed. 500 */ 501 protected function grouped_files_process_response($response, $limit = 0) { 502 // If we can't find the grouping, or there are no matches in the grouping, return empty. 503 if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) { 504 return array(); 505 } 506 507 $numgranted = 0; 508 $orderedids = array(); 509 $completedocs = array(); 510 $incompletedocs = array(); 511 512 $highlightingobj = $response->highlighting; 513 514 // Each group represents a "master document". 515 $groups = $response->grouped->solr_filegroupingid->groups; 516 foreach ($groups as $group) { 517 $groupid = $group->groupValue; 518 $groupdocs = $group->doclist->docs; 519 $firstdoc = reset($groupdocs); 520 521 if (!$searcharea = $this->get_search_area($firstdoc->areaid)) { 522 // Well, this is a problem. 523 continue; 524 } 525 526 // Check for access. 527 $access = $searcharea->check_access($firstdoc->itemid); 528 switch ($access) { 529 case \core_search\manager::ACCESS_DELETED: 530 // If deleted from Moodle, delete from index and then continue. 531 $this->delete_by_id($firstdoc->id); 532 // Remove one from our processed and total counters, since we promptly deleted. 533 $this->processeddocs--; 534 $this->totalenginedocs--; 535 continue 2; 536 break; 537 case \core_search\manager::ACCESS_DENIED: 538 // This means we should just skip for the current user. 539 $this->skippeddocs++; 540 continue 2; 541 break; 542 } 543 $numgranted++; 544 545 $maindoc = false; 546 $fileids = array(); 547 // Seperate the main document and any files returned. 548 foreach ($groupdocs as $groupdoc) { 549 if ($groupdoc->id == $groupid) { 550 $maindoc = $groupdoc; 551 } else if (isset($groupdoc->solr_fileid)) { 552 $fileids[] = $groupdoc->solr_fileid; 553 } 554 } 555 556 // Store the id of this group, in order, for later merging. 557 $orderedids[] = $groupid; 558 559 if (!$maindoc) { 560 // We don't have the main doc, store what we know for later building. 561 $incompletedocs[$groupid] = $fileids; 562 } else { 563 if (isset($highlightingobj->$groupid)) { 564 // Merge the highlighting for this doc. 565 $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid); 566 } 567 $docdata = $this->standarize_solr_obj($maindoc); 568 $doc = $this->to_document($searcharea, $docdata); 569 // Now we need to attach the result files to the doc. 570 foreach ($fileids as $fileid) { 571 $doc->add_stored_file($fileid); 572 } 573 $completedocs[$groupid] = $doc; 574 } 575 576 if (!empty($limit) && $numgranted >= $limit) { 577 // We have hit the max results, we will just ignore the rest. 578 break; 579 } 580 } 581 582 $incompletedocs = $this->get_missing_docs($incompletedocs); 583 584 $out = array(); 585 // Now merge the complete and incomplete documents, in results order. 586 foreach ($orderedids as $docid) { 587 if (isset($completedocs[$docid])) { 588 $out[] = $completedocs[$docid]; 589 } else if (isset($incompletedocs[$docid])) { 590 $out[] = $incompletedocs[$docid]; 591 } 592 } 593 594 return $out; 595 } 596 597 /** 598 * Retreive any missing main documents and attach provided files. 599 * 600 * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value 601 * associated to the key should be an array of stored_files or stored file ids to attach to the result document. 602 * 603 * Return array also indexed by document id. 604 * 605 * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach. 606 * @return document[] 607 */ 608 protected function get_missing_docs($missingdocs) { 609 if (empty($missingdocs)) { 610 return array(); 611 } 612 613 $docids = array_keys($missingdocs); 614 615 // Build a custom query that will get all the missing documents. 616 $query = new \SolrQuery(); 617 $this->set_query($query, '*'); 618 $this->add_fields($query); 619 $query->setRows(count($docids)); 620 $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $docids) . ')'); 621 622 $response = $this->get_query_response($query); 623 // We know the missing docs have already been checked for access, so don't recheck. 624 $results = $this->process_response($response, 0, true); 625 626 $out = array(); 627 foreach ($results as $result) { 628 $resultid = $result->get('id'); 629 if (!isset($missingdocs[$resultid])) { 630 // We got a result we didn't expect. Skip it. 631 continue; 632 } 633 // Attach the files. 634 foreach ($missingdocs[$resultid] as $filedoc) { 635 $result->add_stored_file($filedoc); 636 } 637 $out[$resultid] = $result; 638 } 639 640 return $out; 641 } 642 643 /** 644 * Returns a standard php array from a \SolrObject instance. 645 * 646 * @param \SolrObject $obj 647 * @return array The returned document as an array. 648 */ 649 public function standarize_solr_obj(\SolrObject $obj) { 650 $properties = $obj->getPropertyNames(); 651 652 $docdata = array(); 653 foreach($properties as $name) { 654 // http://php.net/manual/en/solrobject.getpropertynames.php#98018. 655 $name = trim($name); 656 $docdata[$name] = $obj->offsetGet($name); 657 } 658 return $docdata; 659 } 660 661 /** 662 * Adds a document to the search engine. 663 * 664 * This does not commit to the search engine. 665 * 666 * @param document $document 667 * @param bool $fileindexing True if file indexing is to be used 668 * @return bool 669 */ 670 public function add_document($document, $fileindexing = false) { 671 $docdata = $document->export_for_engine(); 672 673 if (!$this->add_solr_document($docdata)) { 674 return false; 675 } 676 677 if ($fileindexing) { 678 // This will take care of updating all attached files in the index. 679 $this->process_document_files($document); 680 } 681 682 return true; 683 } 684 685 /** 686 * Adds a text document to the search engine. 687 * 688 * @param array $doc 689 * @return bool 690 */ 691 protected function add_solr_document($doc) { 692 $solrdoc = new \SolrInputDocument(); 693 foreach ($doc as $field => $value) { 694 $solrdoc->addField($field, $value); 695 } 696 697 try { 698 $result = $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN); 699 return true; 700 } catch (\SolrClientException $e) { 701 debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER); 702 } catch (\SolrServerException $e) { 703 // We only use the first line of the message, as it's a fully java stacktrace behind it. 704 $msg = strtok($e->getMessage(), "\n"); 705 debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER); 706 } 707 708 return false; 709 } 710 711 /** 712 * Index files attached to the docuemnt, ensuring the index matches the current document files. 713 * 714 * For documents that aren't known to be new, we check the index for existing files. 715 * - New files we will add. 716 * - Existing and unchanged files we will skip. 717 * - File that are in the index but not on the document will be deleted from the index. 718 * - Files that have changed will be re-indexed. 719 * 720 * @param document $document 721 */ 722 protected function process_document_files($document) { 723 if (!$this->file_indexing_enabled()) { 724 return; 725 } 726 727 // Maximum rows to process at a time. 728 $rows = 500; 729 730 // Get the attached files. 731 $files = $document->get_files(); 732 733 // If this isn't a new document, we need to check the exiting indexed files. 734 if (!$document->get_is_new()) { 735 // We do this progressively, so we can handle lots of files cleanly. 736 list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $rows); 737 $count = 0; 738 $idstodelete = array(); 739 740 do { 741 // Go through each indexed file. We want to not index any stored and unchanged ones, delete any missing ones. 742 foreach ($indexedfiles as $indexedfile) { 743 $fileid = $indexedfile->solr_fileid; 744 745 if (isset($files[$fileid])) { 746 // Check for changes that would mean we need to re-index the file. If so, just leave in $files. 747 // Filelib does not guarantee time modified is updated, so we will check important values. 748 if ($indexedfile->modified < $files[$fileid]->get_timemodified()) { 749 continue; 750 } 751 if (strcmp($indexedfile->title, $files[$fileid]->get_filename()) !== 0) { 752 continue; 753 } 754 if ($indexedfile->solr_filecontenthash != $files[$fileid]->get_contenthash()) { 755 continue; 756 } 757 if ($indexedfile->solr_fileindexstatus == document::INDEXED_FILE_FALSE && 758 $this->file_is_indexable($files[$fileid])) { 759 // This means that the last time we indexed this file, filtering blocked it. 760 // Current settings say it is indexable, so we will allow it to be indexed. 761 continue; 762 } 763 764 // If the file is already indexed, we can just remove it from the files array and skip it. 765 unset($files[$fileid]); 766 } else { 767 // This means we have found a file that is no longer attached, so we need to delete from the index. 768 // We do it later, since this is progressive, and it could reorder results. 769 $idstodelete[] = $indexedfile->id; 770 } 771 } 772 $count += $rows; 773 774 if ($count < $numfound) { 775 // If we haven't hit the total count yet, fetch the next batch. 776 list($numfound, $indexedfiles) = $this->get_indexed_files($document, $count, $rows); 777 } 778 779 } while ($count < $numfound); 780 781 // Delete files that are no longer attached. 782 foreach ($idstodelete as $id) { 783 // We directly delete the item using the client, as the engine delete_by_id won't work on file docs. 784 $this->get_search_client()->deleteById($id); 785 } 786 } 787 788 // Now we can actually index all the remaining files. 789 foreach ($files as $file) { 790 $this->add_stored_file($document, $file); 791 } 792 } 793 794 /** 795 * Get the currently indexed files for a particular document, returns the total count, and a subset of files. 796 * 797 * @param document $document 798 * @param int $start The row to start the results on. Zero indexed. 799 * @param int $rows The number of rows to fetch 800 * @return array A two element array, the first is the total number of availble results, the second is an array 801 * of documents for the current request. 802 */ 803 protected function get_indexed_files($document, $start = 0, $rows = 500) { 804 // Build a custom query that will get any document files that are in our solr_filegroupingid. 805 $query = new \SolrQuery(); 806 807 // We want to get all file records tied to a document. 808 // For efficiency, we are building our own, stripped down, query. 809 $query->setQuery('*'); 810 $query->setRows($rows); 811 $query->setStart($start); 812 // We want a consistent sorting. 813 $query->addSortField('id'); 814 815 // We only want the bare minimum of fields. 816 $query->addField('id'); 817 $query->addField('modified'); 818 $query->addField('title'); 819 $query->addField('solr_fileid'); 820 $query->addField('solr_filecontenthash'); 821 $query->addField('solr_fileindexstatus'); 822 823 $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')'); 824 $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE); 825 826 $response = $this->get_query_response($query); 827 if (empty($response->response->numFound)) { 828 return array(0, array()); 829 } 830 831 return array($response->response->numFound, $this->convert_file_results($response)); 832 } 833 834 /** 835 * A very lightweight handler for getting information about already indexed files from a Solr response. 836 * 837 * @param SolrObject $responsedoc A Solr response document 838 * @return stdClass[] An array of objects that contain the basic information for file processing. 839 */ 840 protected function convert_file_results($responsedoc) { 841 if (!$docs = $responsedoc->response->docs) { 842 return array(); 843 } 844 845 $out = array(); 846 847 foreach ($docs as $doc) { 848 // Copy the bare minimim needed info. 849 $result = new \stdClass(); 850 $result->id = $doc->id; 851 $result->modified = document::import_time_from_engine($doc->modified); 852 $result->title = $doc->title; 853 $result->solr_fileid = $doc->solr_fileid; 854 $result->solr_filecontenthash = $doc->solr_filecontenthash; 855 $result->solr_fileindexstatus = $doc->solr_fileindexstatus; 856 $out[] = $result; 857 } 858 859 return $out; 860 } 861 862 /** 863 * Adds a file to the search engine. 864 * 865 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename. 866 * Tika has much better content type detection than Moodle, and we will have many more doc failures 867 * if we try to send mime types. 868 * 869 * @param document $document 870 * @param \stored_file $storedfile 871 * @return void 872 */ 873 protected function add_stored_file($document, $storedfile) { 874 $filedoc = $document->export_file_for_engine($storedfile); 875 876 if (!$this->file_is_indexable($storedfile)) { 877 // For files that we don't consider indexable, we will still place a reference in the search engine. 878 $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE; 879 $this->add_solr_document($filedoc); 880 return; 881 } 882 883 $curl = $this->get_curl_object(); 884 885 $url = $this->get_connection_url('/update/extract'); 886 887 // This will prevent solr from automatically making fields for every tika output. 888 $url->param('uprefix', 'ignored_'); 889 890 // Control how content is captured. This will keep our file content clean of non-important metadata. 891 $url->param('captureAttr', 'true'); 892 // Move the content to a field for indexing. 893 $url->param('fmap.content', 'solr_filecontent'); 894 895 // These are common fields that matches the standard *_point dynamic field and causes an error. 896 $url->param('fmap.media_white_point', 'ignored_mwp'); 897 $url->param('fmap.media_black_point', 'ignored_mbp'); 898 899 // Copy each key to the url with literal. 900 // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names. 901 foreach ($filedoc as $key => $value) { 902 // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours. 903 $url->param('fmap.'.$key, 'ignored_'.$key); 904 // Place data in a tmp field. 905 $url->param('literal.mdltmp_'.$key, $value); 906 // Then move to the final field. 907 $url->param('fmap.mdltmp_'.$key, $key); 908 } 909 910 // This sets the true filename for Tika. 911 $url->param('resource.name', $storedfile->get_filename()); 912 913 // A giant block of code that is really just error checking around the curl request. 914 try { 915 // Now actually do the request. 916 $result = $curl->post($url->out(false), array('myfile' => $storedfile)); 917 918 $code = $curl->get_errno(); 919 $info = $curl->get_info(); 920 921 // Now error handling. It is just informational, since we aren't tracking per file/doc results. 922 if ($code != 0) { 923 // This means an internal cURL error occurred error is in result. 924 $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.'; 925 debugging($message, DEBUG_DEVELOPER); 926 } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) { 927 // Unexpected HTTP response code. 928 $message = 'Error while indexing file with document id '.$filedoc['id']; 929 // Try to get error message out of msg or title if it exists. 930 if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) { 931 $message .= ': '.$matches[1]; 932 } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) { 933 $message .= ': '.$matches[1]; 934 } 935 // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter. 936 if (CLI_SCRIPT && !PHPUNIT_TEST) { 937 mtrace($message); 938 } 939 } else { 940 // Check for the expected status field. 941 if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) { 942 // Now check for the expected status of 0, if not, error. 943 if ((int)$matches[1] !== 0) { 944 $message = 'Unexpected Solr status code '.(int)$matches[1]; 945 $message .= ' while indexing file with document id '.$filedoc['id'].'.'; 946 debugging($message, DEBUG_DEVELOPER); 947 } else { 948 // The document was successfully indexed. 949 return; 950 } 951 } else { 952 // We received an unprocessable response. 953 $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': '; 954 $message .= strtok($result, "\n"); 955 debugging($message, DEBUG_DEVELOPER); 956 } 957 } 958 } catch (\Exception $e) { 959 // There was an error, but we are not tracking per-file success, so we just continue on. 960 debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER); 961 } 962 963 // If we get here, the document was not indexed due to an error. So we will index just the base info without the file. 964 $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR; 965 $this->add_solr_document($filedoc); 966 } 967 968 /** 969 * Checks to see if a passed file is indexable. 970 * 971 * @param \stored_file $file The file to check 972 * @return bool True if the file can be indexed 973 */ 974 protected function file_is_indexable($file) { 975 if (!empty($this->config->maxindexfilekb) && ($file->get_filesize() > ($this->config->maxindexfilekb * 1024))) { 976 // The file is too big to index. 977 return false; 978 } 979 980 $mime = $file->get_mimetype(); 981 982 if ($mime == 'application/vnd.moodle.backup') { 983 // We don't index Moodle backup files. There is nothing usefully indexable in them. 984 return false; 985 } 986 987 return true; 988 } 989 990 /** 991 * Commits all pending changes. 992 * 993 * @return void 994 */ 995 protected function commit() { 996 $this->get_search_client()->commit(); 997 } 998 999 /** 1000 * Do any area cleanup needed, and do anything to confirm contents. 1001 * 1002 * Return false to prevent the search area completed time and stats from being updated. 1003 * 1004 * @param \core_search\base $searcharea The search area that was complete 1005 * @param int $numdocs The number of documents that were added to the index 1006 * @param bool $fullindex True if a full index is being performed 1007 * @return bool True means that data is considered indexed 1008 */ 1009 public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) { 1010 $this->commit(); 1011 1012 return true; 1013 } 1014 1015 /** 1016 * Return true if file indexing is supported and enabled. False otherwise. 1017 * 1018 * @return bool 1019 */ 1020 public function file_indexing_enabled() { 1021 return (bool)$this->config->fileindexing; 1022 } 1023 1024 /** 1025 * Defragments the index. 1026 * 1027 * @return void 1028 */ 1029 public function optimize() { 1030 $this->get_search_client()->optimize(1, true, false); 1031 } 1032 1033 /** 1034 * Deletes the specified document. 1035 * 1036 * @param string $id The document id to delete 1037 * @return void 1038 */ 1039 public function delete_by_id($id) { 1040 // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid. 1041 $this->get_search_client()->deleteByQuery('solr_filegroupingid:' . $id); 1042 $this->commit(); 1043 } 1044 1045 /** 1046 * Delete all area's documents. 1047 * 1048 * @param string $areaid 1049 * @return void 1050 */ 1051 public function delete($areaid = null) { 1052 if ($areaid) { 1053 $this->get_search_client()->deleteByQuery('areaid:' . $areaid); 1054 } else { 1055 $this->get_search_client()->deleteByQuery('*:*'); 1056 } 1057 $this->commit(); 1058 } 1059 1060 /** 1061 * Pings the Solr server using search_solr config 1062 * 1063 * @return true|string Returns true if all good or an error string. 1064 */ 1065 public function is_server_ready() { 1066 1067 $configured = $this->is_server_configured(); 1068 if ($configured !== true) { 1069 return $configured; 1070 } 1071 1072 // Check that the schema is already set up. 1073 try { 1074 $schema = new \search_solr\schema(); 1075 $schema->validate_setup(); 1076 } catch (\moodle_exception $e) { 1077 return $e->getMessage(); 1078 } 1079 1080 return true; 1081 } 1082 1083 /** 1084 * Is the solr server properly configured?. 1085 * 1086 * @return true|string Returns true if all good or an error string. 1087 */ 1088 public function is_server_configured() { 1089 1090 if (empty($this->config->server_hostname) || empty($this->config->indexname)) { 1091 return 'No solr configuration found'; 1092 } 1093 1094 if (!$client = $this->get_search_client(false)) { 1095 return get_string('engineserverstatus', 'search'); 1096 } 1097 1098 try { 1099 if ($this->get_solr_major_version() < 4) { 1100 // Minimum solr 4.0. 1101 return get_string('minimumsolr4', 'search_solr'); 1102 } 1103 } catch (\SolrClientException $ex) { 1104 return 'Solr client error: ' . $ex->getMessage(); 1105 } catch (\SolrServerException $ex) { 1106 return 'Solr server error: ' . $ex->getMessage(); 1107 } 1108 1109 return true; 1110 } 1111 1112 /** 1113 * Returns the solr server major version. 1114 * 1115 * @return int 1116 */ 1117 public function get_solr_major_version() { 1118 $systemdata = $this->get_search_client()->system(); 1119 $solrversion = $systemdata->getResponse()->offsetGet('lucene')->offsetGet('solr-spec-version'); 1120 return intval(substr($solrversion, 0, strpos($solrversion, '.'))); 1121 } 1122 1123 /** 1124 * Checks if the PHP Solr extension is available. 1125 * 1126 * @return bool 1127 */ 1128 public function is_installed() { 1129 return function_exists('solr_get_version'); 1130 } 1131 1132 /** 1133 * Returns the solr client instance. 1134 * 1135 * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl. 1136 * 1137 * @throws \core_search\engine_exception 1138 * @param bool $triggerexception 1139 * @return \SolrClient 1140 */ 1141 protected function get_search_client($triggerexception = true) { 1142 1143 // Type comparison as it is set to false if not available. 1144 if ($this->client !== null) { 1145 return $this->client; 1146 } 1147 1148 $options = array( 1149 'hostname' => $this->config->server_hostname, 1150 'path' => '/solr/' . $this->config->indexname, 1151 'login' => !empty($this->config->server_username) ? $this->config->server_username : '', 1152 'password' => !empty($this->config->server_password) ? $this->config->server_password : '', 1153 'port' => !empty($this->config->server_port) ? $this->config->server_port : '', 1154 'secure' => !empty($this->config->secure) ? true : false, 1155 'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '', 1156 'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '', 1157 'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '', 1158 'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '', 1159 'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '', 1160 'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30' 1161 ); 1162 1163 if (!class_exists('\SolrClient')) { 1164 throw new \core_search\engine_exception('enginenotinstalled', 'search', '', 'solr'); 1165 } 1166 1167 $client = new \SolrClient($options); 1168 1169 if ($client === false && $triggerexception) { 1170 throw new \core_search\engine_exception('engineserverstatus', 'search'); 1171 } 1172 1173 if ($this->cacheclient) { 1174 $this->client = $client; 1175 } 1176 1177 return $client; 1178 } 1179 1180 /** 1181 * Returns a curl object for conntecting to solr. 1182 * 1183 * @return \curl 1184 */ 1185 public function get_curl_object() { 1186 if (!is_null($this->curl)) { 1187 return $this->curl; 1188 } 1189 1190 $this->curl = new \curl(); 1191 1192 $options = array(); 1193 // Build the SSL options. Based on pecl-solr and general testing. 1194 if (!empty($this->config->secure)) { 1195 if (!empty($this->config->ssl_cert)) { 1196 $options['CURLOPT_SSLCERT'] = $this->config->ssl_cert; 1197 $options['CURLOPT_SSLCERTTYPE'] = 'PEM'; 1198 } 1199 1200 if (!empty($this->config->ssl_key)) { 1201 $options['CURLOPT_SSLKEY'] = $this->config->ssl_key; 1202 $options['CURLOPT_SSLKEYTYPE'] = 'PEM'; 1203 } 1204 1205 if (!empty($this->config->ssl_keypassword)) { 1206 $options['CURLOPT_KEYPASSWD'] = $this->config->ssl_keypassword; 1207 } 1208 1209 if (!empty($this->config->ssl_cainfo)) { 1210 $options['CURLOPT_CAINFO'] = $this->config->ssl_cainfo; 1211 } 1212 1213 if (!empty($this->config->ssl_capath)) { 1214 $options['CURLOPT_CAPATH'] = $this->config->ssl_capath; 1215 } 1216 } 1217 1218 $this->curl->setopt($options); 1219 1220 if (!empty($this->config->server_username) && !empty($this->config->server_password)) { 1221 $authorization = $this->config->server_username . ':' . $this->config->server_password; 1222 $this->curl->setHeader('Authorization', 'Basic ' . base64_encode($authorization)); 1223 } 1224 1225 return $this->curl; 1226 } 1227 1228 /** 1229 * Return a Moodle url object for the server connection. 1230 * 1231 * @param string $path The solr path to append. 1232 * @return \moodle_url 1233 */ 1234 public function get_connection_url($path) { 1235 // Must use the proper protocol, or SSL will fail. 1236 $protocol = !empty($this->config->secure) ? 'https' : 'http'; 1237 $url = $protocol . '://' . rtrim($this->config->server_hostname, '/'); 1238 if (!empty($this->config->server_port)) { 1239 $url .= ':' . $this->config->server_port; 1240 } 1241 $url .= '/solr/' . $this->config->indexname . '/' . ltrim($path, '/'); 1242 1243 return new \moodle_url($url); 1244 } 1245 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |