search_api-8.x-1.15/src/Plugin/search_api/processor/Tokenizer.php
src/Plugin/search_api/processor/Tokenizer.php
<?php

namespace Drupal\search_api\Plugin\search_api\processor;

use Drupal\Component\Utility\Unicode;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Url;
use Drupal\search_api\Item\FieldInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
use Drupal\search_api\Utility\Utility;

/**
 * Splits text into individual words for searching.
 *
 * @SearchApiProcessor(
 *   id = "tokenizer",
 *   label = @Translation("Tokenizer"),
 *   description = @Translation("Splits text into individual words for searching."),
 *   stages = {
 *     "pre_index_save" = 0,
 *     "preprocess_index" = -6,
 *     "preprocess_query" = -6
 *   }
 * )
 */
class Tokenizer extends FieldsProcessorPluginBase {

  /**
   * PCRE character class contents identifying spaces in this processor.
   *
   * Lazily populated by prepare(); NULL while not yet computed for the current
   * configuration.
   *
   * @var string|null
   */
  protected $spaces;

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $configuration = parent::defaultConfiguration();

    // The "+=" keeps any keys already provided by the parent.
    $configuration += [
      'spaces' => '',
      'overlap_cjk' => TRUE,
      'minimum_word_size' => 3,
    ];

    return $configuration;
  }

  /**
   * {@inheritdoc}
   */
  public function setConfiguration(array $configuration) {
    parent::setConfiguration($configuration);

    // Invalidate the cached character class so prepare() rebuilds it from the
    // new configuration. Resetting to NULL (instead of unsetting the declared
    // property) keeps the property defined while still failing isset().
    $this->spaces = NULL;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);

    $args = [
      ':pcre-url' => Url::fromUri('https://php.net/manual/regexp.reference.character-classes.php')->toString(),
      ':doc-url' => Url::fromUri('https://api.drupal.org/api/drupal/core!lib!Drupal!Component!Utility!Unicode.php/constant/Unicode%3A%3APREG_CLASS_WORD_BOUNDARY/8')->toString(),
    ];

    // Note: the line break inside the description literal below is part of the
    // translatable source string and must be kept as is.
    $form['spaces'] = [
      '#type' => 'textfield',
      '#title' => $this->t('Whitespace characters'),
      '#description' => $this->t('Specify the characters that should be regarded as whitespace and therefore used as word-delimiters. 
Specify the characters as the inside of a <a href=":pcre-url">PCRE character class</a>. Leave empty to use a <a href=":doc-url">default</a> which should be suitable for most languages with a Latin alphabet.', $args),
      '#default_value' => $this->configuration['spaces'],
    ];

    $form['overlap_cjk'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Simple CJK handling'),
      '#default_value' => $this->configuration['overlap_cjk'],
      '#description' => $this->t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Does not affect other languages.'),
    ];

    $form['minimum_word_size'] = [
      '#type' => 'number',
      '#title' => $this->t('Minimum word length to index'),
      '#default_value' => $this->configuration['minimum_word_size'],
      '#min' => 1,
      '#max' => 1000,
      '#description' => $this->t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
    ];

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);

    // Tolerate a missing value instead of raising an undefined index notice.
    $values = $form_state->getValues();
    $spaces = isset($values['spaces']) ? trim($values['spaces']) : '';

    // Escape the delimiter the same way prepare() does, so that exactly the
    // pattern used at runtime is the one being validated.
    $spaces = str_replace('/', '\/', $spaces);

    // Compiling the pattern is the validation; preg_match() returns FALSE for
    // an invalid pattern. The warning it emits in that case is deliberately
    // suppressed, since invalid input is reported as a form error instead.
    if ($spaces !== '' && @preg_match('/[' . $spaces . ']+/u', '') === FALSE) {
      $form_state->setError($form['spaces'], $form['spaces']['#title'] . ': ' . $this->t('The entered text is no valid PCRE character class.'));
    }
  }

  /**
   * {@inheritdoc}
   */
  protected function testType($type) {
    return $this->getDataTypeHelper()->isTextType($type);
  }

  /**
   * {@inheritdoc}
   */
  protected function processField(FieldInterface $field) {
    parent::processField($field);

    // Flag every text value of the field with the "tokenized" property.
    foreach ($field->getValues() as $value) {
      if ($value instanceof TextValueInterface) {
        $value->setProperty('tokenized');
      }
    }
  }

  /**
   * Matches all 'N' Unicode character classes (numbers).
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassNumbers() {
    return '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}' .
      '\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}' .
      '\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}' .
      '\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-' .
      '\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}' .
      '\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}' .
      '\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}' .
      '\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-' .
      '\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}';
  }

  /**
   * Matches all 'P' Unicode character classes (punctuation).
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   */
  protected function getPregClassPunctuation() {
    return '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}' .
      '\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}' .
      '\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}' .
      '\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}' .
      '\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}' .
      '\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}' .
      '\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}' .
      '\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}' .
      '\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-' .
      '\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}' .
      '\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}' .
      '\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}' .
      '\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}' .
      '\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-' .
      '\x{ff65}';
  }

  /**
   * Matches CJK (Chinese, Japanese, Korean) letter-like characters.
   *
   * This list is derived from the "East Asian Scripts" section of
   * http://www.unicode.org/charts/index.html, as well as a comment on
   * http://unicode.org/reports/tr11/tr11-11.html listing some character
   * ranges that are reserved for additional CJK ideographs.
   *
   * The character ranges do not include numbers, punctuation, or symbols, since
   * these are handled separately in search. Note that radicals and strokes are
   * considered symbols. (See
   * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
   *
   * @return string
   *   A string of Unicode characters to use in the regular expression.
   *
   * @see search_expand_cjk()
   */
  protected function getPregClassCjk() {
    return '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
      '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
      '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
      '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
      '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}';
  }

  /**
   * {@inheritdoc}
   */
  protected function processFieldValue(&$value, $type) {
    $this->prepare();
    $text = $this->simplifyText($value);
    $min = $this->configuration['minimum_word_size'];

    // Split on spaces. The configured (or default) delimiters have been
    // replaced by those already in simplifyText().
    $value = [];
    foreach (explode(' ', $text) as $token) {
      // explode() yields a single empty string for empty text; never turn that
      // into a token, even if the minimum word size is configured as 0.
      if ($token === '') {
        continue;
      }
      // Numeric tokens are kept regardless of their length.
      if (is_numeric($token) || mb_strlen($token) >= $min) {
        $value[] = Utility::createTextToken($token);
      }
    }
  }

  /**
   * Simplifies a string according to indexing rules.
   *
   * If any of the regular expression steps fails (for instance, because the
   * text is not valid UTF-8), an empty string is returned.
   *
   * @param string $text
   *   The text to simplify.
   *
   * @return string
   *   The text with tokens split by single spaces.
   *
   * @see search_simplify()
   */
  protected function simplifyText($text) {
    // Make sure $this->spaces is initialized even if this method is called
    // without process() or processFieldValue() having run first. Otherwise,
    // the last pattern below would be the invalid "/[]+/u".
    $this->prepare();

    // Optionally apply simple CJK handling to the text.
    if ($this->configuration['overlap_cjk']) {
      $text = preg_replace_callback('/[' . $this->getPregClassCjk() . ']+/u', [$this, 'expandCjk'], $text);
      if ($text === NULL) {
        return '';
      }
    }

    $numbers = $this->getPregClassNumbers();
    $steps = [
      // To improve searching for numerical data such as dates, IP addresses or
      // version numbers, we consider a group of numerical characters separated
      // only by punctuation characters to be one piece. This also means, for
      // example, that searching for "20/03/1984" also returns results with
      // "20-03-1984" in them.
      // Readable regular expression: "([number]+)[punctuation]+(?=[number])".
      ['/([' . $numbers . ']+)[' . $this->getPregClassPunctuation() . ']+(?=[' . $numbers . '])/u', '\1'],
      // Multiple dot and dash groups are word boundaries and replaced with
      // space. No need to use the Unicode modifier here because 0-127 ASCII
      // characters can't match higher UTF-8 characters as the leftmost bit of
      // those are 1.
      ['/[.-]{2,}/', ' '],
      // The dot, underscore and dash are simply removed. This allows meaningful
      // search behavior with acronyms and URLs. See Unicode note directly
      // above.
      ['/[._-]+/', ''],
      // With the exception of the rules above, we consider all punctuation,
      // marks, spaces, etc, to be a word boundary.
      ['/[' . $this->spaces . ']+/u', ' '],
    ];

    foreach ($steps as list($pattern, $replacement)) {
      $text = preg_replace($pattern, $replacement, $text);
      // preg_replace() returns NULL on failure. Stop right away instead of
      // feeding NULL into the remaining steps; the result is an empty string
      // either way.
      if ($text === NULL) {
        return '';
      }
    }

    return trim($text);
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * Callback for preg_replace_callback() in simplifyText().
   *
   * Normally, searches should match exact words, where a word is defined to be
   * a sequence of characters delimited by spaces or punctuation. CJK languages
   * are written in long strings of characters, though, not split up into words.
   * So in order to allow search matching, we split up CJK text into tokens
   * consisting of consecutive, overlapping sequences of characters whose length
   * is equal to the "minimum_word_size" setting. This tokenizing is only done
   * if the "overlap_cjk" setting is enabled.
   *
   * @param array $matches
   *   A PCRE match array, containing the complete match as the only element.
   *
   * @return string
   *   Tokenized text, with tokens separated with space characters and starting
   *   and ending with a space.
   *
   * @see search_expand_cjk()
   */
  protected function expandCjk(array $matches) {
    // A token needs at least one character; smaller settings behave like 1.
    $min = max(1, (int) $this->configuration['minimum_word_size']);
    $str = $matches[0];

    // Split the match into its characters once, instead of repeatedly cutting
    // the first character off the remaining string.
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);

    // If the text could not be split, or is not longer than the minimum word
    // size, don't tokenize it.
    if ($chars === FALSE || count($chars) <= $min) {
      return ' ' . $str . ' ';
    }

    // Slide a window of $min characters over the text, one character at a
    // time, producing overlapping tokens.
    $tokens = [];
    $last_start = count($chars) - $min;
    for ($start = 0; $start <= $last_start; $start++) {
      $tokens[] = implode('', array_slice($chars, $start, $min));
    }

    return ' ' . implode(' ', $tokens) . ' ';
  }

  /**
   * {@inheritdoc}
   */
  protected function process(&$value) {
    // We don't process integers, NULL values or the like.
    if (!is_string($value)) {
      return;
    }

    $this->prepare();
    $value = trim($this->simplifyText($value));

    $min = $this->configuration['minimum_word_size'];
    if ($min > 1) {
      // Drop all words shorter than the configured minimum.
      // NOTE(review): unlike processFieldValue(), numeric words shorter than
      // the minimum are dropped here as well - confirm this is intended.
      $words = array_filter(explode(' ', $value), function ($word) use ($min) {
        return mb_strlen($word) >= $min;
      });
      $value = implode(' ', $words);
    }
  }

  /**
   * Prepares the processor by setting the $spaces property.
   *
   * Uses the configured "spaces" character class (with the regular expression
   * delimiter "/" escaped) if there is one, and the default word boundary
   * class otherwise. Does nothing if the property is already set.
   */
  protected function prepare() {
    if (isset($this->spaces)) {
      return;
    }

    if ($this->configuration['spaces'] !== '') {
      $this->spaces = str_replace('/', '\/', $this->configuration['spaces']);
    }
    else {
      $this->spaces = Unicode::PREG_CLASS_WORD_BOUNDARY;
    }
  }

}