search_api-8.x-1.15/tests/src/Unit/Processor/TokenizerTest.php

tests/src/Unit/Processor/TokenizerTest.php
<?php

namespace Drupal\Tests\search_api\Unit\Processor;

use Drupal\search_api\Entity\Index;
use Drupal\search_api\Plugin\search_api\parse_mode\Direct;
use Drupal\search_api\Plugin\search_api\processor\Tokenizer;
use Drupal\search_api\Query\Query;
use Drupal\search_api\Utility\Utility;
use Drupal\Tests\UnitTestCase;

/**
 * Tests the "Tokenizer" processor.
 *
 * @group search_api
 *
 * @see \Drupal\search_api\Plugin\search_api\processor\Tokenizer
 */
class TokenizerTest extends UnitTestCase {

  use ProcessorTestTrait;

  /**
   * {@inheritdoc}
   */
  protected function setUp() {
    parent::setUp();

    // Each test starts with a tokenizer created from an empty configuration
    // array; tests that need other settings call setConfiguration() later.
    $tokenizer = new Tokenizer([], 'tokenizer', []);
    $this->processor = $tokenizer;
  }

  /**
   * Tests the processFieldValue() method.
   *
   * @param string $passed_value
   *   The field value handed to the processor's processFieldValue() method.
   * @param array $expected_value
   *   The value expected after processing, as supplied by the data provider
   *   (a list of text tokens).
   * @param array $config
   *   (optional) Configuration overriding the processor's defaults.
   *
   * @dataProvider textDataProvider
   */
  public function testProcessFieldValue($passed_value, $expected_value, array $config = []) {
    if (!empty($config)) {
      $this->processor->setConfiguration($config);
    }

    // The value is placed in the argument list by reference so that the
    // processed result can be read back from $passed_value afterwards.
    $field_type = 'text';
    $arguments = [&$passed_value, $field_type];
    $this->invokeMethod('processFieldValue', $arguments);

    $this->assertEquals($expected_value, $passed_value);
  }

  /**
   * Provides test data for testProcessFieldValue().
   *
   * @return array
   *   A list of argument sets for testProcessFieldValue(). Each set holds, in
   *   this order:
   *   - The field value passed to the processor's processFieldValue() method.
   *   - The expected preprocessed value.
   *   - (optional) Configuration to override the processor's defaults.
   */
  public function textDataProvider() {
    $word = Utility::createTextToken('word');
    $cases = [];

    // Simple cases: a single word, and two words separated by a space.
    $cases[] = ['word', [$word]];
    $cases[] = ['word word', [$word, $word]];

    // With the default settings, special characters split words, too.
    $cases[] = ['words!word', [Utility::createTextToken('words'), $word]];
    $cases[] = ['words$word', [Utility::createTextToken('words'), $word]];

    // A custom "spaces" setting replaces the default and is case-sensitive:
    // only the upper-case "X" acts as a separator here.
    $cases[] = [
      'wordXwordxword',
      [$word, Utility::createTextToken('wordxword')],
      ['spaces' => 'X'],
    ];
    // With digits as separators, "!" no longer splits words.
    $cases[] = [
      'word3word!word',
      [$word, Utility::createTextToken('word!word')],
      ['spaces' => '\d'],
    ];
    // Character ranges (one or several) are supported.
    $cases[] = [
      'wordXwordRword',
      [$word, $word, $word],
      ['spaces' => 'R-Z'],
    ];
    $cases[] = [
      'wordXwordRword',
      [$word, $word, $word],
      ['spaces' => 'R-TW-Z'],
    ];
    // A plain space still separates words when "spaces" is overridden.
    $cases[] = [
      'wordXword word',
      [$word, $word, $word],
      ['spaces' => 'R-Z'],
    ];

    // Words shorter than the minimum word size are dropped.
    $cases[] = [
      'wordSwo',
      [$word],
      ['spaces' => 'R-Z'],
    ];
    $cases[] = [
      'wordSwo',
      [$word, Utility::createTextToken('wo')],
      ['spaces' => 'R-Z', 'minimum_word_size' => 2],
    ];
    $cases[] = [
      'word w',
      [$word],
      ['minimum_word_size' => 2],
    ];
    $cases[] = [
      'word w',
      [$word, Utility::createTextToken('w')],
      ['minimum_word_size' => 1],
    ];
    $cases[] = [
      'word wordword',
      [],
      ['minimum_word_size' => 10],
    ];

    // A single dash is removed instead of being treated as a word boundary.
    $cases[] = [
      'foo-bar',
      [Utility::createTextToken('foobar')],
    ];

    return $cases;
  }

  /**
   * Tests that the simplifyText() method handles CJK characters properly.
   *
   * Numbers, symbols and punctuation get special treatment in simplifyText(),
   * so only CJK characters outside of those character classes are checked
   * here. See PREG_CLASS_CJK for more information.
   */
  public function testCjkSupport() {
    $this->invokeMethod('prepare');

    // First and last code point of various CJK character ranges in the
    // Unicode tables, keyed by a label for the range.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Collect the first, middle and last character of every range.
    $chars = [];
    foreach ($ranges as list($first, $last)) {
      $middle = round(0.5 * ($first + $last));
      $chars[] = static::codepointToUtf8($first);
      $chars[] = static::codepointToUtf8($middle);
      $chars[] = static::codepointToUtf8($last);
    }

    // Join the characters into one string and run it through the tokenizer.
    $text = implode('', $chars);
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);

    // The expected result is every run of three consecutive characters of the
    // original string (all its 3-grams), separated by single spaces.
    $trigrams = [];
    $last_offset = count($chars) - 3;
    for ($offset = 0; $offset <= $last_offset; $offset++) {
      $trigrams[] = implode('', array_slice($chars, $offset, 3));
    }
    $expected = implode(' ', $trigrams);

    $this->assertEquals($expected, $simplified_text, 'CJK tokenizer worked on all supplied CJK characters');

    // With "overlap_cjk" disabled, the text must come back unchanged.
    $this->processor->setConfiguration(['overlap_cjk' => FALSE]);
    $this->invokeMethod('prepare');
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);
    $this->assertEquals($text, $simplified_text, 'CJK tokenizing is successfully disabled');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * A sanity check only: a plain run of Latin letters has to pass through
   * simplifyText() unchanged.
   */
  public function testNoTokenizer() {
    // Use a minimum word size of 1 (to split all CJK characters).
    $settings = ['minimum_word_size' => 1];
    $this->processor->setConfiguration($settings);
    $this->invokeMethod('prepare');

    $latin = 'abcdefghijklmnopqrstuvwxyz';
    $simplified = $this->invokeMethod('simplifyText', [$latin]);

    $this->assertEquals($latin, $simplified, 'Latin letters are not CJK tokenized');
  }

  /**
   * Converts a Unicode code point to a UTF-8 string.
   *
   * The PHP function "chr()" only produces single bytes, so characters above
   * code point 127 have to be assembled from several bytes by hand. Adapted
   * from functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   A Unicode code point. Integral floats (for example, the result of
   *   round()) are accepted and converted to integers.
   *
   * @return string
   *   A UTF-8 string containing the character corresponding to that code
   *   point, or an empty string if the number is not a valid Unicode scalar
   *   value (negative, a UTF-16 surrogate, or above U+10FFFF).
   */
  protected static function codepointToUtf8($num) {
    $num = (int) $num;

    // Negative numbers, surrogates (U+D800 to U+DFFF) and anything above
    // U+10FFFF cannot be represented in well-formed UTF-8.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xC0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3F));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xE0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xF0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3F))
      . chr(0x80 | (($num >> 6) & 0x3F))
      . chr(0x80 | ($num & 0x3F));
  }

  /**
   * Tests that all Unicode characters simplify correctly.
   *
   * This test uses a Drupal core search file that was constructed so that the
   * even lines are boundary characters, and the odd lines are valid word
   * characters. (It was generated as a sequence of all the Unicode characters,
   * and then the boundary characters (punctuation, spaces, etc.) were split
   * off into their own lines.) So the even-numbered lines should simplify to
   * nothing, and the odd-numbered lines are split into shorter chunks to
   * verify that simplification doesn't lose any characters.
   *
   * @see \Drupal\search\Tests\SearchSimplifyTest::testSearchSimplifyUnicode()
   */
  public function testSearchSimplifyUnicode() {
    // Set the minimum word size to 1 (to split all CJK characters).
    $this->processor->setConfiguration(['minimum_word_size' => 1]);
    $this->invokeMethod('prepare');

    // Check the fixture explicitly: a missing file would otherwise only
    // surface as a PHP warning, and an empty one would let the test pass
    // without checking anything.
    $path = $this->root . '/core/modules/search/tests/UnicodeTest.txt';
    $this->assertFileExists($path, 'The Unicode test file of the core Search module exists.');
    $input = file_get_contents($path);
    $this->assertNotEmpty($input, 'The Unicode test file could be read and is not empty.');

    $chunks = [];
    foreach (explode("\n", $input) as $key => $line) {
      // Keys are zero-based, so an odd key is an even-numbered line.
      if ($key % 2) {
        // Even line, should be removed by simplifyText().
        $simplified = $this->invokeMethod('simplifyText', [$line]);
        $this->assertEquals('', $simplified, "Line $key is excluded from the index");
        continue;
      }

      // Odd line, should be word characters (which might be expanded, but
      // never removed). Split this into 30-character chunks, so we don't run
      // into limits of truncation.
      $length = mb_strlen($line);
      for ($start = 0; $start < $length; $start += 30) {
        $chunk = mb_substr($line, $start, 30);
        // Special case: leading zeros are removed from numeric strings, and
        // there's one string in this file that is numbers starting with zero,
        // so prepend a 1 on that string.
        if (preg_match('/^[0-9]+$/', $chunk)) {
          $chunk = '1' . $chunk;
        }
        $chunks[] = $chunk;
      }
    }

    foreach ($chunks as $key => $chunk) {
      $simplified = $this->invokeMethod('simplifyText', [$chunk]);
      $this->assertGreaterThanOrEqual(mb_strlen($chunk), mb_strlen($simplified), "Nothing is removed from string $key.");
    }

    // Test the low-numbered ASCII control characters separately. They are not
    // in the text file because they are problematic for diff, especially \0.
    $control_chars = implode('', array_map('chr', range(0, 31)));
    $this->assertEquals('', $this->invokeMethod('simplifyText', [$control_chars]), 'Text simplification works for ASCII control characters.');
  }

  /**
   * Tests whether punctuation is treated correctly.
   *
   * @param string $passed_value
   *   The string handed to simplifyText().
   * @param string $expected_value
   *   The string simplifyText() is expected to return.
   * @param string $message
   *   The message to display for the assertion.
   *
   * @dataProvider searchSimplifyPunctuationProvider
   */
  public function testSearchSimplifyPunctuation($passed_value, $expected_value, $message) {
    // Use a minimum word size of 1 (to split all CJK characters).
    $settings = ['minimum_word_size' => 1];
    $this->processor->setConfiguration($settings);
    $this->invokeMethod('prepare');

    $simplified = $this->invokeMethod('simplifyText', [$passed_value]);
    $this->assertEquals($expected_value, $simplified, $message);
  }

  /**
   * Provides test data for testSearchSimplifyPunctuation().
   *
   * @return array
   *   A list of argument sets for testSearchSimplifyPunctuation(). Each set
   *   holds, in this order:
   *   - The string passed to simplifyText().
   *   - The expected return value.
   *   - The message to display for the assertion.
   */
  public function searchSimplifyPunctuationProvider() {
    $cases = [];

    $cases[] = [
      '20.03/94-28,876',
      '20039428876',
      'Punctuation removed from numbers',
    ];
    $cases[] = [
      'great...drupal--module',
      'great drupal module',
      'Multiple dot and dashes are word boundaries',
    ];
    $cases[] = [
      'very_great-drupal.module',
      'verygreatdrupalmodule',
      'Single dot, dash, underscore are removed',
    ];
    $cases[] = [
      'regular,punctuation;word',
      'regular punctuation word',
      'Punctuation is a word boundary',
    ];

    // This text is expected to come back exactly as it went in.
    $accented = 'Äußerung français repülőtér';
    $cases[] = [
      $accented,
      $accented,
      'Umlauts and accented characters are not treated as word boundaries',
    ];

    return $cases;
  }

  /**
   * Tests search keywords preprocessing.
   *
   * @param string|array $keys
   *   The keys set on the query before preprocessing.
   * @param string|array $expected
   *   The keys the query is expected to hold after preprocessing.
   *
   * @dataProvider preprocessSearchQueryProvider
   */
  public function testPreprocessSearchQuery($keys, $expected) {
    // Mock an index that reports itself as enabled and attach it to the
    // processor. The assert() calls document the mock's two types.
    $index = $this->createMock(Index::class);
    assert($index instanceof Index);
    assert($index instanceof \PHPUnit_Framework_MockObject_MockObject);
    $index->method('status')->willReturn(TRUE);
    $this->processor->setIndex($index);

    // Build a query on that index which uses the "direct" parse mode.
    $parse_mode = new Direct([], 'direct', []);
    $query = new Query($index);
    $query->setParseMode($parse_mode);
    $query->keys($keys);

    $this->processor->preprocessSearchQuery($query);

    $actual = $query->getKeys();
    $this->assertEquals($expected, $actual);
  }

  /**
   * Provides test data for testPreprocessSearchQuery().
   *
   * @return array
   *   Argument sets for testPreprocessSearchQuery(), keyed by a short
   *   description of the case. Each set holds, in this order:
   *   - The original keys.
   *   - The expected keys after preprocessing.
   */
  public function preprocessSearchQueryProvider() {
    $cases = [];

    $cases['convert whitespace'] = ["foo\tbar\n\nbaz ", 'foo bar baz'];
    $cases['single dash'] = ['foo-bar', 'foobar'];
    $cases['multiple dashes'] = ['foo--bar', 'foo bar'];
    $cases['remove short word'] = ['foo in bar', 'foo bar'];
    $cases['single short word'] = ['in', ''];

    return $cases;
  }

}
Главная | Обратная связь
You are here

search_api-8.x-1.15/tests/src/Unit/Processor/TokenizerTest.php