search_api-8.x-1.15/src/Plugin/search_api/processor/Stemmer.php
src/Plugin/search_api/processor/Stemmer.php
<?php

namespace Drupal\search_api\Plugin\search_api\processor;

use Drupal\Core\Form\FormStateInterface;
use Drupal\search_api\Plugin\search_api\processor\Resources\Porter2;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
use Drupal\search_api\Query\QueryInterface;

/**
 * Stems search terms.
 *
 * Only English ("en") items and queries are processed. Each word is reduced to
 * its stem by the Porter2 class, and per-word overrides can be configured as
 * "exceptions" (original word => stem).
 *
 * @SearchApiProcessor(
 *   id = "stemmer",
 *   label = @Translation("Stemmer"),
 *   description = @Translation("Stems search terms (for example, <em>talking</em> to <em>talk</em>). Currently, this only acts on English language content. It uses the Porter 2 stemmer algorithm (<a href=""https://wikipedia.org/wiki/Stemming"">More information</a>). For best results, use after tokenizing."),
 *   stages = {
 *     "pre_index_save" = 0,
 *     "preprocess_index" = 0,
 *     "preprocess_query" = 0,
 *   }
 * )
 */
class Stemmer extends FieldsProcessorPluginBase {

  /**
   * Static cache for already-generated stems.
   *
   * Keyed by the original word; filled lazily by process().
   *
   * @var string[]
   */
  protected $stems = [];

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();

    // "+=" only adds the key if the parent has not already defined it.
    $configuration += [
      'exceptions' => [
        'texan' => 'texa',
        'mexican' => 'mexic',
      ],
    ];

    return $configuration;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    $description = $this->t('If the <a href="http://snowball.tartarus.org/algorithms/english/stemmer.html">algorithm</a> does not stem words in your dataset in the desired way, you can enter specific exceptions in the form of WORD=STEM, where "WORD" is the original word in the text and "STEM" is the resulting stem. List each exception on a separate line.');

    // Convert the keyed array into the WORD=STEM text format, one exception
    // per line. The lines are assembled by hand instead of with
    // http_build_query(): that function URL-encodes keys and values, which
    // garbles non-ASCII words and spaces in the textarea (and, after the next
    // save, in the stored configuration).
    $lines = [];
    foreach ($this->configuration['exceptions'] as $word => $stem) {
      // Skip values that cannot be written as a single WORD=STEM line.
      if (is_scalar($stem)) {
        $lines[] = $word . '=' . $stem;
      }
    }
    $default_value = implode("\n", $lines);

    $form['exceptions'] = [
      '#type' => 'textarea',
      '#title' => $this->t('Exceptions'),
      '#description' => $description,
      '#default_value' => $default_value,
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    // The textarea content is parsed as INI syntax, which turns the WORD=STEM
    // lines back into a keyed array (word => stem).
    $exceptions = $form_state->getValue('exceptions');
    $parsed = parse_ini_string($exceptions);

    if ($parsed === FALSE) {
      // The text could not be parsed: flag the textarea with an error.
      $el = $form['exceptions'];
      $form_state->setError($el, $el['#title'] . ': ' . $this->t('The entered text is not in valid WORD=STEM format.'));
    }
    else {
      // Replace the raw text with the parsed array in the form values.
      $form_state->setValue('exceptions', $parsed);
    }
  }

  /**
   * {@inheritdoc}
   */
  public function preprocessIndexItems(array $items) {
    foreach ($items as $item) {
      // Limit this processor to English language data.
      if ($item->getLanguage() !== 'en') {
        continue;
      }
      foreach ($item->getFields() as $name => $field) {
        if ($this->testField($name, $field)) {
          $this->processField($field);
        }
      }
    }
  }

  /**
   * {@inheritdoc}
   */
  public function preprocessSearchQuery(QueryInterface $query) {
    // Only process queries that can (also) return English language content.
    // An empty language list is treated as "no restriction" and is processed.
    $languages = $query->getLanguages();
    if ($languages && !in_array('en', $languages, TRUE)) {
      return;
    }
    parent::preprocessSearchQuery($query);
  }

  /**
   * {@inheritdoc}
   */
  protected function testType($type) {
    return $this->getDataTypeHelper()->isTextType($type);
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    // In the absence of the tokenizer processor, this ensures split words:
    // tags are stripped and the text is split on every run of characters that
    // are neither letters nor digits. preg_split() returns FALSE on failure
    // (for example, malformed UTF-8); fall back to an empty list in that case
    // so the loop below does not iterate over a boolean.
    $words = preg_split('/[^\p{L}\p{N}]+/u', strip_tags($value), -1, PREG_SPLIT_NO_EMPTY) ?: [];

    $stemmed = [];
    foreach ($words as $word) {
      // To optimize processing, store processed stems in a static array.
      if (!isset($this->stems[$word])) {
        $stem = new Porter2($word, $this->configuration['exceptions']);
        $this->stems[$word] = $stem->stem();
      }
      $stemmed[] = $this->stems[$word];
    }

    // The stems are joined with single spaces and written back by reference.
    $value = implode(' ', $stemmed);
  }

}