[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/search/engine/solr/classes/ -> engine.php (source)

   1  <?php
   2  // This file is part of Moodle - http://moodle.org/
   3  //
   4  // Moodle is free software: you can redistribute it and/or modify
   5  // it under the terms of the GNU General Public License as published by
   6  // the Free Software Foundation, either version 3 of the License, or
   7  // (at your option) any later version.
   8  //
   9  // Moodle is distributed in the hope that it will be useful,
  10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  // GNU General Public License for more details.
  13  //
  14  // You should have received a copy of the GNU General Public License
  15  // along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
  16  
  17  /**
  18   * Solr engine.
  19   *
  20   * @package    search_solr
  21   * @copyright  2015 Daniel Neis Araujo
  22   * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  23   */
  24  
  25  namespace search_solr;
  26  
  27  defined('MOODLE_INTERNAL') || die();
  28  
  29  /**
  30   * Solr engine.
  31   *
  32   * @package    search_solr
  33   * @copyright  2015 Daniel Neis Araujo
  34   * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  35   */
  36  class engine extends \core_search\engine {
  37  
  38      /**
  39       * @var string The date format used by solr.
  40       */
  41      const DATE_FORMAT = 'Y-m-d\TH:i:s\Z';
  42  
  43      /**
   44       * @var int Commit documents interval (number of milliseconds).
  45       */
  46      const AUTOCOMMIT_WITHIN = 15000;
  47  
  48      /**
  49       * The maximum number of results to fetch at a time.
  50       */
  51      const QUERY_SIZE = 120;
  52  
  53      /**
  54       * Highlighting fragsize. Slightly larger than output size (500) to allow for ... appending.
  55       */
  56      const FRAG_SIZE = 510;
  57  
  58      /**
  59       * Marker for the start of a highlight.
  60       */
  61      const HIGHLIGHT_START = '@@HI_S@@';
  62  
  63      /**
  64       * Marker for the end of a highlight.
  65       */
  66      const HIGHLIGHT_END = '@@HI_E@@';
  67  
  68      /**
  69       * @var \SolrClient
  70       */
  71      protected $client = null;
  72  
  73      /**
  74       * @var bool True if we should reuse SolrClients, false if not.
  75       */
  76      protected $cacheclient = true;
  77  
  78      /**
  79       * @var \curl Direct curl object.
  80       */
  81      protected $curl = null;
  82  
  83      /**
  84       * @var array Fields that can be highlighted.
  85       */
  86      protected $highlightfields = array('title', 'content', 'description1', 'description2');
  87  
  88      /**
   89       * @var int Number of total docs reported by Solr for the last query.
  90       */
  91      protected $totalenginedocs = 0;
  92  
  93      /**
  94       * @var int Number of docs we have processed for the last query.
  95       */
  96      protected $processeddocs = 0;
  97  
  98      /**
  99       * @var int Number of docs that have been skipped while processing the last query.
 100       */
 101      protected $skippeddocs = 0;
 102  
 103      /**
 104       * Initialises the search engine configuration.
 105       *
 106       * @return void
 107       */
 108      public function __construct() {
 109          parent::__construct();
 110  
 111          $curlversion = curl_version();
 112          if (isset($curlversion['version']) && stripos($curlversion['version'], '7.35.') === 0) {
 113              // There is a flaw with curl 7.35.0 that causes problems with client reuse.
 114              $this->cacheclient = false;
 115          }
 116      }
 117  
 118      /**
 119       * Prepares a Solr query, applies filters and executes it returning its results.
 120       *
 121       * @throws \core_search\engine_exception
 122       * @param  stdClass  $filters Containing query and filters.
 123       * @param  array     $usercontexts Contexts where the user has access. True if the user can access all contexts.
 124       * @param  int       $limit The maximum number of results to return.
  125       * @return \core_search\document[] Results, or an empty array if there are no results
 126       */
 127      public function execute_query($filters, $usercontexts, $limit = 0) {
 128          global $USER;
 129  
 130          if (empty($limit)) {
 131              $limit = \core_search\manager::MAX_RESULTS;
 132          }
 133  
 134          // If there is any problem we trigger the exception as soon as possible.
 135          $client = $this->get_search_client();
 136  
 137          // Create the query object.
 138          $query = $this->create_user_query($filters, $usercontexts);
 139  
 140          // We expect good match rates, so for our first get, we will get a small number of records.
 141          // This significantly speeds solr response time for first few pages.
 142          $query->setRows(min($limit * 3, static::QUERY_SIZE));
 143          $response = $this->get_query_response($query);
 144  
 145          // Get count data out of the response, and reset our counters.
 146          list($included, $found) = $this->get_response_counts($response);
 147          $this->totalenginedocs = $found;
 148          $this->processeddocs = 0;
 149          $this->skippeddocs = 0;
 150          if ($included == 0 || $this->totalenginedocs == 0) {
 151              // No results.
 152              return array();
 153          }
 154  
 155          // Get valid documents out of the response.
 156          $results = $this->process_response($response, $limit);
 157  
 158          // We have processed all the docs in the response at this point.
 159          $this->processeddocs += $included;
 160  
 161          // If we haven't reached the limit, and there are more docs left in Solr, lets keep trying.
 162          while (count($results) < $limit && ($this->totalenginedocs - $this->processeddocs) > 0) {
 163              // Offset the start of the query, and since we are making another call, get more per call.
 164              $query->setStart($this->processeddocs);
 165              $query->setRows(static::QUERY_SIZE);
 166  
 167              $response = $this->get_query_response($query);
 168              list($included, $found) = $this->get_response_counts($response);
 169              if ($included == 0 || $found == 0) {
 170                  // No new results were found. Found being empty would be weird, so we will just return.
 171                  return $results;
 172              }
 173              $this->totalenginedocs = $found;
 174  
 175              // Get the new response docs, limiting to remaining we need, then add it to the end of the results array.
 176              $newdocs = $this->process_response($response, $limit - count($results));
 177              $results = array_merge($results, $newdocs);
 178  
 179              // Add to our processed docs count.
 180              $this->processeddocs += $included;
 181          }
 182  
 183          return $results;
 184      }
 185  
 186      /**
 187       * Takes a query and returns the response in SolrObject format.
 188       *
 189       * @param  SolrQuery  $query Solr query object.
 190       * @return SolrObject|false Response document or false on error.
 191       */
 192      protected function get_query_response($query) {
 193          try {
 194              return $this->get_search_client()->query($query)->getResponse();
 195          } catch (\SolrClientException $ex) {
 196              debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
 197              $this->queryerror = $ex->getMessage();
 198              return false;
 199          } catch (\SolrServerException $ex) {
 200              debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
 201              $this->queryerror = $ex->getMessage();
 202              return false;
 203          }
 204      }
 205  
 206      /**
  207       * Returns the total number of documents available for the most recent call to execute_query.
 208       *
 209       * @return int
 210       */
 211      public function get_query_total_count() {
 212          // Return the total engine count minus the docs we have determined are bad.
 213          return $this->totalenginedocs - $this->skippeddocs;
 214      }
 215  
 216      /**
 217       * Returns count information for a provided response. Will return 0, 0 for invalid or empty responses.
 218       *
  219       * @param SolrObject|false $response The response from Solr, or false if the query failed.
  220       * @return array A two part array. First how many response docs are in the response.
  221       *               Second, how many results are available in the engine.
 222       */
 223      protected function get_response_counts($response) {
 224          $found = 0;
 225          $included = 0;
 226  
 227          if (isset($response->grouped->solr_filegroupingid->ngroups)) {
 228              // Get the number of results for file grouped queries.
 229              $found = $response->grouped->solr_filegroupingid->ngroups;
 230              $included = count($response->grouped->solr_filegroupingid->groups);
 231          } else if (isset($response->response->numFound)) {
 232              // Get the number of results for standard queries.
 233              $found = $response->response->numFound;
 234              $included = count($response->response->docs);
 235          }
 236  
 237          return array($included, $found);
 238      }
 239  
 240      /**
 241       * Prepares a new query object with needed limits, filters, etc.
 242       *
 243       * @param stdClass  $filters Containing query and filters.
 244       * @param array     $usercontexts Contexts where the user has access. True if the user can access all contexts.
  245       * @return SolrDisMaxQuery|array The query, or an empty array when the user has no valid contexts.
 246       */
 247      protected function create_user_query($filters, $usercontexts) {
 248          global $USER;
 249  
 250          // Let's keep these changes internal.
 251          $data = clone $filters;
 252  
 253          $query = new \SolrDisMaxQuery();
 254  
 255          $this->set_query($query, $data->q);
 256          $this->add_fields($query);
 257  
 258          // Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
 259          // we are really interested in caching contexts filters instead.
 260          if (!empty($data->title)) {
 261              $query->addFilterQuery('{!field cache=false f=title}' . $data->title);
 262          }
 263          if (!empty($data->areaids)) {
 264              // If areaids are specified, we want to get any that match.
 265              $query->addFilterQuery('{!cache=false}areaid:(' . implode(' OR ', $data->areaids) . ')');
 266          }
 267          if (!empty($data->courseids)) {
 268              $query->addFilterQuery('{!cache=false}courseid:(' . implode(' OR ', $data->courseids) . ')');
 269          }
 270  
 271          if (!empty($data->timestart) or !empty($data->timeend)) {
 272              if (empty($data->timestart)) {
 273                  $data->timestart = '*';
 274              } else {
 275                  $data->timestart = \search_solr\document::format_time_for_engine($data->timestart);
 276              }
 277              if (empty($data->timeend)) {
 278                  $data->timeend = '*';
 279              } else {
 280                  $data->timeend = \search_solr\document::format_time_for_engine($data->timeend);
 281              }
 282  
 283              // No cache.
 284              $query->addFilterQuery('{!cache=false}modified:[' . $data->timestart . ' TO ' . $data->timeend . ']');
 285          }
 286  
 287          // Restrict to users who are supposed to be able to see a particular result.
 288          $query->addFilterQuery('owneruserid:(' . \core_search\manager::NO_OWNER_ID . ' OR ' . $USER->id . ')');
 289  
 290          // And finally restrict it to the context where the user can access, we want this one cached.
 291          // If the user can access all contexts $usercontexts value is just true, we don't need to filter
 292          // in that case.
 293          if ($usercontexts && is_array($usercontexts)) {
 294              // Join all area contexts into a single array and implode.
 295              $allcontexts = array();
 296              foreach ($usercontexts as $areaid => $areacontexts) {
 297                  if (!empty($data->areaids) && !in_array($areaid, $data->areaids)) {
 298                      // Skip unused areas.
 299                      continue;
 300                  }
 301                  foreach ($areacontexts as $contextid) {
 302                      // Ensure they are unique.
 303                      $allcontexts[$contextid] = $contextid;
 304                  }
 305              }
 306              if (empty($allcontexts)) {
 307                  // This means there are no valid contexts for them, so they get no results.
 308                  return array();
 309              }
 310              $query->addFilterQuery('contextid:(' . implode(' OR ', $allcontexts) . ')');
 311          }
 312  
 313          if ($this->file_indexing_enabled()) {
 314              // Now group records by solr_filegroupingid. Limit to 3 results per group.
 315              $query->setGroup(true);
 316              $query->setGroupLimit(3);
 317              $query->setGroupNGroups(true);
 318              $query->addGroupField('solr_filegroupingid');
 319          } else {
 320              // Make sure we only get text files, in case the index has pre-existing files.
 321              $query->addFilterQuery('type:'.\core_search\manager::TYPE_TEXT);
 322          }
 323  
 324          return $query;
 325      }
 326  
 327      /**
  328       * Prepares a query by setting the highlighting options, the query text and the default number of rows to return.
  329       *
  330       * @param SolrQuery $query
  331       * @param string    $q The query text that is passed on to Solr.
 332       */
 333      protected function set_query($query, $q) {
 334          // Set hightlighting.
 335          $query->setHighlight(true);
 336          foreach ($this->highlightfields as $field) {
 337              $query->addHighlightField($field);
 338          }
 339          $query->setHighlightFragsize(static::FRAG_SIZE);
 340          $query->setHighlightSimplePre(self::HIGHLIGHT_START);
 341          $query->setHighlightSimplePost(self::HIGHLIGHT_END);
 342          $query->setHighlightMergeContiguous(true);
 343  
 344          $query->setQuery($q);
 345  
 346          // A reasonable max.
 347          $query->setRows(static::QUERY_SIZE);
 348      }
 349  
 350      /**
 351       * Sets fields to be returned in the result.
 352       *
 353       * @param SolrDisMaxQuery|SolrQuery $query object.
 354       */
 355      public function add_fields($query) {
 356          $documentclass = $this->get_document_classname();
 357          $fields = $documentclass::get_default_fields_definition();
 358  
 359          $dismax = false;
 360          if ($query instanceof \SolrDisMaxQuery) {
 361              $dismax = true;
 362          }
 363  
 364          foreach ($fields as $key => $field) {
 365              $query->addField($key);
 366              if ($dismax && !empty($field['mainquery'])) {
 367                  // Add fields the main query should be run against.
 368                  $query->addQueryField($key);
 369              }
 370          }
 371      }
 372  
 373      /**
  374       * Finds the key common to both highlighting and docs array returned from response.
 375       * @param object $response containing results.
 376       */
 377      public function add_highlight_content($response) {
 378          if (!isset($response->highlighting)) {
 379              // There is no highlighting to add.
 380              return;
 381          }
 382  
 383          $highlightedobject = $response->highlighting;
 384          foreach ($response->response->docs as $doc) {
 385              $x = $doc->id;
 386              $highlighteddoc = $highlightedobject->$x;
 387              $this->merge_highlight_field_values($doc, $highlighteddoc);
 388          }
 389      }
 390  
 391      /**
 392       * Adds the highlighting array values to docs array values.
 393       *
 394       * @throws \core_search\engine_exception
 395       * @param object $doc containing the results.
 396       * @param object $highlighteddoc containing the highlighted results values.
 397       */
 398      public function merge_highlight_field_values($doc, $highlighteddoc) {
 399  
 400          foreach ($this->highlightfields as $field) {
 401              if (!empty($doc->$field)) {
 402  
 403                  // Check that the returned value is not an array. No way we can make this work with multivalued solr fields.
 404                  if (is_array($doc->{$field})) {
 405                      throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $field);
 406                  }
 407  
 408                  if (!empty($highlighteddoc->$field)) {
 409                      // Replace by the highlighted result.
 410                      $doc->$field = reset($highlighteddoc->$field);
 411                  }
 412              }
 413          }
 414      }
 415  
 416      /**
 417       * Filters the response on Moodle side.
 418       *
 419       * @param SolrObject $response Solr object containing the response return from solr server.
 420       * @param int        $limit The maximum number of results to return. 0 for all.
 421       * @param bool       $skipaccesscheck Don't use check_access() on results. Only to be used when results have known access.
 422       * @return array $results containing final results to be displayed.
 423       */
 424      protected function process_response($response, $limit = 0, $skipaccesscheck = false) {
 425          global $USER;
 426  
 427          if (empty($response)) {
 428              return array();
 429          }
 430  
 431          if (isset($response->grouped)) {
 432              return $this->grouped_files_process_response($response, $limit);
 433          }
 434  
 435          $userid = $USER->id;
 436          $noownerid = \core_search\manager::NO_OWNER_ID;
 437  
 438          $numgranted = 0;
 439  
 440          if (!$docs = $response->response->docs) {
 441              return array();
 442          }
 443  
 444          $out = array();
 445          if (!empty($response->response->numFound)) {
 446              $this->add_highlight_content($response);
 447  
 448              // Iterate through the results checking its availability and whether they are available for the user or not.
 449              foreach ($docs as $key => $docdata) {
 450                  if ($docdata['owneruserid'] != $noownerid && $docdata['owneruserid'] != $userid) {
 451                      // If owneruserid is set, no other user should be able to access this record.
 452                      continue;
 453                  }
 454  
 455                  if (!$searcharea = $this->get_search_area($docdata->areaid)) {
 456                      continue;
 457                  }
 458  
 459                  $docdata = $this->standarize_solr_obj($docdata);
 460  
 461                  if ($skipaccesscheck) {
 462                      $access = \core_search\manager::ACCESS_GRANTED;
 463                  } else {
 464                      $access = $searcharea->check_access($docdata['itemid']);
 465                  }
 466                  switch ($access) {
 467                      case \core_search\manager::ACCESS_DELETED:
 468                          $this->delete_by_id($docdata['id']);
 469                          // Remove one from our processed and total counters, since we promptly deleted.
 470                          $this->processeddocs--;
 471                          $this->totalenginedocs--;
 472                          break;
 473                      case \core_search\manager::ACCESS_DENIED:
 474                          $this->skippeddocs++;
 475                          break;
 476                      case \core_search\manager::ACCESS_GRANTED:
 477                          $numgranted++;
 478  
 479                          // Add the doc.
 480                          $out[] = $this->to_document($searcharea, $docdata);
 481                          break;
 482                  }
 483  
 484                  // Stop when we hit our limit.
 485                  if (!empty($limit) && count($out) >= $limit) {
 486                      break;
 487                  }
 488              }
 489          }
 490  
 491          return $out;
 492      }
 493  
 494      /**
 495       * Processes grouped file results into documents, with attached matching files.
 496       *
 497       * @param SolrObject $response The response returned from solr server
 498       * @param int        $limit The maximum number of results to return. 0 for all.
 499       * @return array Final results to be displayed.
 500       */
 501      protected function grouped_files_process_response($response, $limit = 0) {
 502          // If we can't find the grouping, or there are no matches in the grouping, return empty.
 503          if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
 504              return array();
 505          }
 506  
 507          $numgranted = 0;
 508          $orderedids = array();
 509          $completedocs = array();
 510          $incompletedocs = array();
 511  
 512          $highlightingobj = $response->highlighting;
 513  
 514          // Each group represents a "master document".
 515          $groups = $response->grouped->solr_filegroupingid->groups;
 516          foreach ($groups as $group) {
 517              $groupid = $group->groupValue;
 518              $groupdocs = $group->doclist->docs;
 519              $firstdoc = reset($groupdocs);
 520  
 521              if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
 522                  // Well, this is a problem.
 523                  continue;
 524              }
 525  
 526              // Check for access.
 527              $access = $searcharea->check_access($firstdoc->itemid);
 528              switch ($access) {
 529                  case \core_search\manager::ACCESS_DELETED:
 530                      // If deleted from Moodle, delete from index and then continue.
 531                      $this->delete_by_id($firstdoc->id);
 532                      // Remove one from our processed and total counters, since we promptly deleted.
 533                      $this->processeddocs--;
 534                      $this->totalenginedocs--;
 535                      continue 2;
 536                      break;
 537                  case \core_search\manager::ACCESS_DENIED:
 538                      // This means we should just skip for the current user.
 539                      $this->skippeddocs++;
 540                      continue 2;
 541                      break;
 542              }
 543              $numgranted++;
 544  
 545              $maindoc = false;
 546              $fileids = array();
 547              // Seperate the main document and any files returned.
 548              foreach ($groupdocs as $groupdoc) {
 549                  if ($groupdoc->id == $groupid) {
 550                      $maindoc = $groupdoc;
 551                  } else if (isset($groupdoc->solr_fileid)) {
 552                      $fileids[] = $groupdoc->solr_fileid;
 553                  }
 554              }
 555  
 556              // Store the id of this group, in order, for later merging.
 557              $orderedids[] = $groupid;
 558  
 559              if (!$maindoc) {
 560                  // We don't have the main doc, store what we know for later building.
 561                  $incompletedocs[$groupid] = $fileids;
 562              } else {
 563                  if (isset($highlightingobj->$groupid)) {
 564                      // Merge the highlighting for this doc.
 565                      $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
 566                  }
 567                  $docdata = $this->standarize_solr_obj($maindoc);
 568                  $doc = $this->to_document($searcharea, $docdata);
 569                  // Now we need to attach the result files to the doc.
 570                  foreach ($fileids as $fileid) {
 571                      $doc->add_stored_file($fileid);
 572                  }
 573                  $completedocs[$groupid] = $doc;
 574              }
 575  
 576              if (!empty($limit) && $numgranted >= $limit) {
 577                  // We have hit the max results, we will just ignore the rest.
 578                  break;
 579              }
 580          }
 581  
 582          $incompletedocs = $this->get_missing_docs($incompletedocs);
 583  
 584          $out = array();
 585          // Now merge the complete and incomplete documents, in results order.
 586          foreach ($orderedids as $docid) {
 587              if (isset($completedocs[$docid])) {
 588                  $out[] = $completedocs[$docid];
 589              } else if (isset($incompletedocs[$docid])) {
 590                  $out[] = $incompletedocs[$docid];
 591              }
 592          }
 593  
 594          return $out;
 595      }
 596  
 597      /**
  598       * Retrieve any missing main documents and attach provided files.
 599       *
 600       * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value
 601       * associated to the key should be an array of stored_files or stored file ids to attach to the result document.
 602       *
 603       * Return array also indexed by document id.
 604       *
 605       * @param array() $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
 606       * @return document[]
 607       */
 608      protected function get_missing_docs($missingdocs) {
 609          if (empty($missingdocs)) {
 610              return array();
 611          }
 612  
 613          $docids = array_keys($missingdocs);
 614  
 615          // Build a custom query that will get all the missing documents.
 616          $query = new \SolrQuery();
 617          $this->set_query($query, '*');
 618          $this->add_fields($query);
 619          $query->setRows(count($docids));
 620          $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $docids) . ')');
 621  
 622          $response = $this->get_query_response($query);
 623          // We know the missing docs have already been checked for access, so don't recheck.
 624          $results = $this->process_response($response, 0, true);
 625  
 626          $out = array();
 627          foreach ($results as $result) {
 628              $resultid = $result->get('id');
 629              if (!isset($missingdocs[$resultid])) {
 630                  // We got a result we didn't expect. Skip it.
 631                  continue;
 632              }
 633              // Attach the files.
 634              foreach ($missingdocs[$resultid] as $filedoc) {
 635                  $result->add_stored_file($filedoc);
 636              }
 637              $out[$resultid] = $result;
 638          }
 639  
 640          return $out;
 641      }
 642  
 643      /**
 644       * Returns a standard php array from a \SolrObject instance.
 645       *
 646       * @param \SolrObject $obj
 647       * @return array The returned document as an array.
 648       */
 649      public function standarize_solr_obj(\SolrObject $obj) {
 650          $properties = $obj->getPropertyNames();
 651  
 652          $docdata = array();
 653          foreach($properties as $name) {
 654              // http://php.net/manual/en/solrobject.getpropertynames.php#98018.
 655              $name = trim($name);
 656              $docdata[$name] = $obj->offsetGet($name);
 657          }
 658          return $docdata;
 659      }
 660  
 661      /**
 662       * Adds a document to the search engine.
 663       *
 664       * This does not commit to the search engine.
 665       *
 666       * @param document $document
 667       * @param bool     $fileindexing True if file indexing is to be used
 668       * @return bool
 669       */
 670      public function add_document($document, $fileindexing = false) {
 671          $docdata = $document->export_for_engine();
 672  
 673          if (!$this->add_solr_document($docdata)) {
 674              return false;
 675          }
 676  
 677          if ($fileindexing) {
 678              // This will take care of updating all attached files in the index.
 679              $this->process_document_files($document);
 680          }
 681  
 682          return true;
 683      }
 684  
 685      /**
 686       * Adds a text document to the search engine.
 687       *
 688       * @param array $doc
 689       * @return bool
 690       */
 691      protected function add_solr_document($doc) {
 692          $solrdoc = new \SolrInputDocument();
 693          foreach ($doc as $field => $value) {
 694              $solrdoc->addField($field, $value);
 695          }
 696  
 697          try {
 698              $result = $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
 699              return true;
 700          } catch (\SolrClientException $e) {
 701              debugging('Solr client error adding document with id ' . $doc['id'] . ': ' . $e->getMessage(), DEBUG_DEVELOPER);
 702          } catch (\SolrServerException $e) {
 703              // We only use the first line of the message, as it's a fully java stacktrace behind it.
 704              $msg = strtok($e->getMessage(), "\n");
 705              debugging('Solr server error adding document with id ' . $doc['id'] . ': ' . $msg, DEBUG_DEVELOPER);
 706          }
 707  
 708          return false;
 709      }
 710  
 711      /**
  712       * Index files attached to the document, ensuring the index matches the current document files.
 713       *
 714       * For documents that aren't known to be new, we check the index for existing files.
 715       * - New files we will add.
 716       * - Existing and unchanged files we will skip.
 717       * - File that are in the index but not on the document will be deleted from the index.
 718       * - Files that have changed will be re-indexed.
 719       *
 720       * @param document $document
 721       */
 722      protected function process_document_files($document) {
 723          if (!$this->file_indexing_enabled()) {
 724              return;
 725          }
 726  
 727          // Maximum rows to process at a time.
 728          $rows = 500;
 729  
 730          // Get the attached files.
 731          $files = $document->get_files();
 732  
 733          // If this isn't a new document, we need to check the exiting indexed files.
 734          if (!$document->get_is_new()) {
 735              // We do this progressively, so we can handle lots of files cleanly.
 736              list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $rows);
 737              $count = 0;
 738              $idstodelete = array();
 739  
 740              do {
 741                  // Go through each indexed file. We want to not index any stored and unchanged ones, delete any missing ones.
 742                  foreach ($indexedfiles as $indexedfile) {
 743                      $fileid = $indexedfile->solr_fileid;
 744  
 745                      if (isset($files[$fileid])) {
 746                          // Check for changes that would mean we need to re-index the file. If so, just leave in $files.
 747                          // Filelib does not guarantee time modified is updated, so we will check important values.
 748                          if ($indexedfile->modified < $files[$fileid]->get_timemodified()) {
 749                              continue;
 750                          }
 751                          if (strcmp($indexedfile->title, $files[$fileid]->get_filename()) !== 0) {
 752                              continue;
 753                          }
 754                          if ($indexedfile->solr_filecontenthash != $files[$fileid]->get_contenthash()) {
 755                              continue;
 756                          }
 757                          if ($indexedfile->solr_fileindexstatus == document::INDEXED_FILE_FALSE &&
 758                                  $this->file_is_indexable($files[$fileid])) {
 759                              // This means that the last time we indexed this file, filtering blocked it.
 760                              // Current settings say it is indexable, so we will allow it to be indexed.
 761                              continue;
 762                          }
 763  
 764                          // If the file is already indexed, we can just remove it from the files array and skip it.
 765                          unset($files[$fileid]);
 766                      } else {
 767                          // This means we have found a file that is no longer attached, so we need to delete from the index.
 768                          // We do it later, since this is progressive, and it could reorder results.
 769                          $idstodelete[] = $indexedfile->id;
 770                      }
 771                  }
 772                  $count += $rows;
 773  
 774                  if ($count < $numfound) {
 775                      // If we haven't hit the total count yet, fetch the next batch.
 776                      list($numfound, $indexedfiles) = $this->get_indexed_files($document, $count, $rows);
 777                  }
 778  
 779              } while ($count < $numfound);
 780  
 781              // Delete files that are no longer attached.
 782              foreach ($idstodelete as $id) {
 783                  // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
 784                  $this->get_search_client()->deleteById($id);
 785              }
 786          }
 787  
 788          // Now we can actually index all the remaining files.
 789          foreach ($files as $file) {
 790              $this->add_stored_file($document, $file);
 791          }
 792      }
 793  
 794      /**
 795       * Get the currently indexed files for a particular document, returns the total count, and a subset of files.
 796       *
 797       * @param document $document
 798       * @param int      $start The row to start the results on. Zero indexed.
 799       * @param int      $rows The number of rows to fetch
 800       * @return array   A two element array, the first is the total number of availble results, the second is an array
 801       *                 of documents for the current request.
 802       */
 803      protected function get_indexed_files($document, $start = 0, $rows = 500) {
 804          // Build a custom query that will get any document files that are in our solr_filegroupingid.
 805          $query = new \SolrQuery();
 806  
 807          // We want to get all file records tied to a document.
 808          // For efficiency, we are building our own, stripped down, query.
 809          $query->setQuery('*');
 810          $query->setRows($rows);
 811          $query->setStart($start);
 812          // We want a consistent sorting.
 813          $query->addSortField('id');
 814  
 815          // We only want the bare minimum of fields.
 816          $query->addField('id');
 817          $query->addField('modified');
 818          $query->addField('title');
 819          $query->addField('solr_fileid');
 820          $query->addField('solr_filecontenthash');
 821          $query->addField('solr_fileindexstatus');
 822  
 823          $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
 824          $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);
 825  
 826          $response = $this->get_query_response($query);
 827          if (empty($response->response->numFound)) {
 828              return array(0, array());
 829          }
 830  
 831          return array($response->response->numFound, $this->convert_file_results($response));
 832      }
 833  
 834      /**
 835       * A very lightweight handler for getting information about already indexed files from a Solr response.
 836       *
 837       * @param SolrObject $responsedoc A Solr response document
 838       * @return stdClass[] An array of objects that contain the basic information for file processing.
 839       */
 840      protected function convert_file_results($responsedoc) {
 841          if (!$docs = $responsedoc->response->docs) {
 842              return array();
 843          }
 844  
 845          $out = array();
 846  
 847          foreach ($docs as $doc) {
 848              // Copy the bare minimim needed info.
 849              $result = new \stdClass();
 850              $result->id = $doc->id;
 851              $result->modified = document::import_time_from_engine($doc->modified);
 852              $result->title = $doc->title;
 853              $result->solr_fileid = $doc->solr_fileid;
 854              $result->solr_filecontenthash = $doc->solr_filecontenthash;
 855              $result->solr_fileindexstatus = $doc->solr_fileindexstatus;
 856              $out[] = $result;
 857          }
 858  
 859          return $out;
 860      }
 861  
 862      /**
 863       * Adds a file to the search engine.
 864       *
 865       * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
 866       * Tika has much better content type detection than Moodle, and we will have many more doc failures
 867       * if we try to send mime types.
 868       *
 869       * @param document $document
 870       * @param \stored_file $storedfile
 871       * @return void
 872       */
 873      protected function add_stored_file($document, $storedfile) {
 874          $filedoc = $document->export_file_for_engine($storedfile);
 875  
 876          if (!$this->file_is_indexable($storedfile)) {
 877              // For files that we don't consider indexable, we will still place a reference in the search engine.
 878              $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
 879              $this->add_solr_document($filedoc);
 880              return;
 881          }
 882  
 883          $curl = $this->get_curl_object();
 884  
 885          $url = $this->get_connection_url('/update/extract');
 886  
 887          // This will prevent solr from automatically making fields for every tika output.
 888          $url->param('uprefix', 'ignored_');
 889  
 890          // Control how content is captured. This will keep our file content clean of non-important metadata.
 891          $url->param('captureAttr', 'true');
 892          // Move the content to a field for indexing.
 893          $url->param('fmap.content', 'solr_filecontent');
 894  
 895          // These are common fields that matches the standard *_point dynamic field and causes an error.
 896          $url->param('fmap.media_white_point', 'ignored_mwp');
 897          $url->param('fmap.media_black_point', 'ignored_mbp');
 898  
 899          // Copy each key to the url with literal.
 900          // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
 901          foreach ($filedoc as $key => $value) {
 902              // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
 903              $url->param('fmap.'.$key, 'ignored_'.$key);
 904              // Place data in a tmp field.
 905              $url->param('literal.mdltmp_'.$key, $value);
 906              // Then move to the final field.
 907              $url->param('fmap.mdltmp_'.$key, $key);
 908          }
 909  
 910          // This sets the true filename for Tika.
 911          $url->param('resource.name', $storedfile->get_filename());
 912  
 913          // A giant block of code that is really just error checking around the curl request.
 914          try {
 915              // Now actually do the request.
 916              $result = $curl->post($url->out(false), array('myfile' => $storedfile));
 917  
 918              $code = $curl->get_errno();
 919              $info = $curl->get_info();
 920  
 921              // Now error handling. It is just informational, since we aren't tracking per file/doc results.
 922              if ($code != 0) {
 923                  // This means an internal cURL error occurred error is in result.
 924                  $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.';
 925                  debugging($message, DEBUG_DEVELOPER);
 926              } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
 927                  // Unexpected HTTP response code.
 928                  $message = 'Error while indexing file with document id '.$filedoc['id'];
 929                  // Try to get error message out of msg or title if it exists.
 930                  if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
 931                      $message .= ': '.$matches[1];
 932                  } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
 933                      $message .= ': '.$matches[1];
 934                  }
 935                  // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
 936                  if (CLI_SCRIPT && !PHPUNIT_TEST) {
 937                      mtrace($message);
 938                  }
 939              } else {
 940                  // Check for the expected status field.
 941                  if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
 942                      // Now check for the expected status of 0, if not, error.
 943                      if ((int)$matches[1] !== 0) {
 944                          $message = 'Unexpected Solr status code '.(int)$matches[1];
 945                          $message .= ' while indexing file with document id '.$filedoc['id'].'.';
 946                          debugging($message, DEBUG_DEVELOPER);
 947                      } else {
 948                          // The document was successfully indexed.
 949                          return;
 950                      }
 951                  } else {
 952                      // We received an unprocessable response.
 953                      $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
 954                      $message .= strtok($result, "\n");
 955                      debugging($message, DEBUG_DEVELOPER);
 956                  }
 957              }
 958          } catch (\Exception $e) {
 959              // There was an error, but we are not tracking per-file success, so we just continue on.
 960              debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
 961          }
 962  
 963          // If we get here, the document was not indexed due to an error. So we will index just the base info without the file.
 964          $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
 965          $this->add_solr_document($filedoc);
 966      }
 967  
 968      /**
 969       * Checks to see if a passed file is indexable.
 970       *
 971       * @param \stored_file $file The file to check
 972       * @return bool True if the file can be indexed
 973       */
 974      protected function file_is_indexable($file) {
 975          if (!empty($this->config->maxindexfilekb) && ($file->get_filesize() > ($this->config->maxindexfilekb * 1024))) {
 976              // The file is too big to index.
 977              return false;
 978          }
 979  
 980          $mime = $file->get_mimetype();
 981  
 982          if ($mime == 'application/vnd.moodle.backup') {
 983              // We don't index Moodle backup files. There is nothing usefully indexable in them.
 984              return false;
 985          }
 986  
 987          return true;
 988      }
 989  
 990      /**
 991       * Commits all pending changes.
 992       *
 993       * @return void
 994       */
 995      protected function commit() {
 996          $this->get_search_client()->commit();
 997      }
 998  
 999      /**
1000       * Do any area cleanup needed, and do anything to confirm contents.
1001       *
1002       * Return false to prevent the search area completed time and stats from being updated.
1003       *
1004       * @param \core_search\base $searcharea The search area that was complete
1005       * @param int $numdocs The number of documents that were added to the index
1006       * @param bool $fullindex True if a full index is being performed
1007       * @return bool True means that data is considered indexed
1008       */
1009      public function area_index_complete($searcharea, $numdocs = 0, $fullindex = false) {
1010          $this->commit();
1011  
1012          return true;
1013      }
1014  
1015      /**
1016       * Return true if file indexing is supported and enabled. False otherwise.
1017       *
1018       * @return bool
1019       */
1020      public function file_indexing_enabled() {
1021          return (bool)$this->config->fileindexing;
1022      }
1023  
1024      /**
1025       * Defragments the index.
1026       *
1027       * @return void
1028       */
1029      public function optimize() {
1030          $this->get_search_client()->optimize(1, true, false);
1031      }
1032  
1033      /**
1034       * Deletes the specified document.
1035       *
1036       * @param string $id The document id to delete
1037       * @return void
1038       */
1039      public function delete_by_id($id) {
1040          // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid.
1041          $this->get_search_client()->deleteByQuery('solr_filegroupingid:' . $id);
1042          $this->commit();
1043      }
1044  
1045      /**
1046       * Delete all area's documents.
1047       *
1048       * @param string $areaid
1049       * @return void
1050       */
1051      public function delete($areaid = null) {
1052          if ($areaid) {
1053              $this->get_search_client()->deleteByQuery('areaid:' . $areaid);
1054          } else {
1055              $this->get_search_client()->deleteByQuery('*:*');
1056          }
1057          $this->commit();
1058      }
1059  
1060      /**
1061       * Pings the Solr server using search_solr config
1062       *
1063       * @return true|string Returns true if all good or an error string.
1064       */
1065      public function is_server_ready() {
1066  
1067          $configured = $this->is_server_configured();
1068          if ($configured !== true) {
1069              return $configured;
1070          }
1071  
1072          // Check that the schema is already set up.
1073          try {
1074              $schema = new \search_solr\schema();
1075              $schema->validate_setup();
1076          } catch (\moodle_exception $e) {
1077              return $e->getMessage();
1078          }
1079  
1080          return true;
1081      }
1082  
1083      /**
1084       * Is the solr server properly configured?.
1085       *
1086       * @return true|string Returns true if all good or an error string.
1087       */
1088      public function is_server_configured() {
1089  
1090          if (empty($this->config->server_hostname) || empty($this->config->indexname)) {
1091              return 'No solr configuration found';
1092          }
1093  
1094          if (!$client = $this->get_search_client(false)) {
1095              return get_string('engineserverstatus', 'search');
1096          }
1097  
1098          try {
1099              if ($this->get_solr_major_version() < 4) {
1100                  // Minimum solr 4.0.
1101                  return get_string('minimumsolr4', 'search_solr');
1102              }
1103          } catch (\SolrClientException $ex) {
1104              return 'Solr client error: ' . $ex->getMessage();
1105          } catch (\SolrServerException $ex) {
1106              return 'Solr server error: ' . $ex->getMessage();
1107          }
1108  
1109          return true;
1110      }
1111  
1112      /**
1113       * Returns the solr server major version.
1114       *
1115       * @return int
1116       */
1117      public function get_solr_major_version() {
1118          $systemdata = $this->get_search_client()->system();
1119          $solrversion = $systemdata->getResponse()->offsetGet('lucene')->offsetGet('solr-spec-version');
1120          return intval(substr($solrversion, 0, strpos($solrversion, '.')));
1121      }
1122  
1123      /**
1124       * Checks if the PHP Solr extension is available.
1125       *
1126       * @return bool
1127       */
1128      public function is_installed() {
1129          return function_exists('solr_get_version');
1130      }
1131  
1132      /**
1133       * Returns the solr client instance.
1134       *
1135       * We don't reuse SolrClient if we are on libcurl 7.35.0, due to a bug in that version of curl.
1136       *
1137       * @throws \core_search\engine_exception
1138       * @param bool $triggerexception
1139       * @return \SolrClient
1140       */
1141      protected function get_search_client($triggerexception = true) {
1142  
1143          // Type comparison as it is set to false if not available.
1144          if ($this->client !== null) {
1145              return $this->client;
1146          }
1147  
1148          $options = array(
1149              'hostname' => $this->config->server_hostname,
1150              'path'     => '/solr/' . $this->config->indexname,
1151              'login'    => !empty($this->config->server_username) ? $this->config->server_username : '',
1152              'password' => !empty($this->config->server_password) ? $this->config->server_password : '',
1153              'port'     => !empty($this->config->server_port) ? $this->config->server_port : '',
1154              'secure' => !empty($this->config->secure) ? true : false,
1155              'ssl_cert' => !empty($this->config->ssl_cert) ? $this->config->ssl_cert : '',
1156              'ssl_key' => !empty($this->config->ssl_key) ? $this->config->ssl_key : '',
1157              'ssl_keypassword' => !empty($this->config->ssl_keypassword) ? $this->config->ssl_keypassword : '',
1158              'ssl_cainfo' => !empty($this->config->ssl_cainfo) ? $this->config->ssl_cainfo : '',
1159              'ssl_capath' => !empty($this->config->ssl_capath) ? $this->config->ssl_capath : '',
1160              'timeout' => !empty($this->config->server_timeout) ? $this->config->server_timeout : '30'
1161          );
1162  
1163          if (!class_exists('\SolrClient')) {
1164              throw new \core_search\engine_exception('enginenotinstalled', 'search', '', 'solr');
1165          }
1166  
1167          $client = new \SolrClient($options);
1168  
1169          if ($client === false && $triggerexception) {
1170              throw new \core_search\engine_exception('engineserverstatus', 'search');
1171          }
1172  
1173          if ($this->cacheclient) {
1174              $this->client = $client;
1175          }
1176  
1177          return $client;
1178      }
1179  
1180      /**
1181       * Returns a curl object for conntecting to solr.
1182       *
1183       * @return \curl
1184       */
1185      public function get_curl_object() {
1186          if (!is_null($this->curl)) {
1187              return $this->curl;
1188          }
1189  
1190          $this->curl = new \curl();
1191  
1192          $options = array();
1193          // Build the SSL options. Based on pecl-solr and general testing.
1194          if (!empty($this->config->secure)) {
1195              if (!empty($this->config->ssl_cert)) {
1196                  $options['CURLOPT_SSLCERT'] = $this->config->ssl_cert;
1197                  $options['CURLOPT_SSLCERTTYPE'] = 'PEM';
1198              }
1199  
1200              if (!empty($this->config->ssl_key)) {
1201                  $options['CURLOPT_SSLKEY'] = $this->config->ssl_key;
1202                  $options['CURLOPT_SSLKEYTYPE'] = 'PEM';
1203              }
1204  
1205              if (!empty($this->config->ssl_keypassword)) {
1206                  $options['CURLOPT_KEYPASSWD'] = $this->config->ssl_keypassword;
1207              }
1208  
1209              if (!empty($this->config->ssl_cainfo)) {
1210                  $options['CURLOPT_CAINFO'] = $this->config->ssl_cainfo;
1211              }
1212  
1213              if (!empty($this->config->ssl_capath)) {
1214                  $options['CURLOPT_CAPATH'] = $this->config->ssl_capath;
1215              }
1216          }
1217  
1218          $this->curl->setopt($options);
1219  
1220          if (!empty($this->config->server_username) && !empty($this->config->server_password)) {
1221              $authorization = $this->config->server_username . ':' . $this->config->server_password;
1222              $this->curl->setHeader('Authorization', 'Basic ' . base64_encode($authorization));
1223          }
1224  
1225          return $this->curl;
1226      }
1227  
1228      /**
1229       * Return a Moodle url object for the server connection.
1230       *
1231       * @param string $path The solr path to append.
1232       * @return \moodle_url
1233       */
1234      public function get_connection_url($path) {
1235          // Must use the proper protocol, or SSL will fail.
1236          $protocol = !empty($this->config->secure) ? 'https' : 'http';
1237          $url = $protocol . '://' . rtrim($this->config->server_hostname, '/');
1238          if (!empty($this->config->server_port)) {
1239              $url .= ':' . $this->config->server_port;
1240          }
1241          $url .= '/solr/' . $this->config->indexname . '/' . ltrim($path, '/');
1242  
1243          return new \moodle_url($url);
1244      }
1245  }


Generated: Thu Aug 11 10:00:09 2016 Cross-referenced by PHPXref 0.7.1