search_api-8.x-1.15/tests/src/Unit/Processor/TokenizerTest.php
tests/src/Unit/Processor/TokenizerTest.php
<?php

namespace Drupal\Tests\search_api\Unit\Processor;

use Drupal\search_api\Entity\Index;
use Drupal\search_api\Plugin\search_api\parse_mode\Direct;
use Drupal\search_api\Plugin\search_api\processor\Tokenizer;
use Drupal\search_api\Query\Query;
use Drupal\search_api\Utility\Utility;
use Drupal\Tests\UnitTestCase;

/**
 * Tests the "Tokenizer" processor.
 *
 * @group search_api
 *
 * @see \Drupal\search_api\Plugin\search_api\processor\Tokenizer
 */
class TokenizerTest extends UnitTestCase {

  use ProcessorTestTrait;

  /**
   * {@inheritdoc}
   */
  protected function setUp() {
    parent::setUp();

    // Every test starts from a tokenizer with its default configuration.
    $this->processor = new Tokenizer([], 'tokenizer', []);
  }

  /**
   * Tests the processFieldValue() method.
   *
   * @param string $passed_value
   *   The field value passed to the processor's processFieldValue() method.
   * @param string $expected_value
   *   The expected preprocessed value.
   * @param array $config
   *   (optional) Configuration to override the processor's defaults.
   *
   * @dataProvider textDataProvider
   */
  public function testProcessFieldValue($passed_value, $expected_value, array $config = []) {
    if ($config) {
      $this->processor->setConfiguration($config);
    }
    $type = 'text';
    // The value is passed by reference, so the processed result is read back
    // from $passed_value afterwards.
    $this->invokeMethod('processFieldValue', [&$passed_value, $type]);
    $this->assertEquals($expected_value, $passed_value);
  }

  /**
   * Provides test data for testProcessFieldValue().
   *
   * @return array
   *   Arrays of parameters for testProcessFieldValue(), each containing (in
   *   this order):
   *   - The field value passed to the processor's processFieldValue() method.
   *   - The expected preprocessed value.
   *   - (optional) Configuration to override the processor's defaults.
   */
  public function textDataProvider() {
    $word_token = Utility::createTextToken('word');
    return [
      // Test some simple cases.
      ['word', [$word_token]],
      ['word word', [$word_token, $word_token]],
      // Test whether the default splits on special characters, too.
      ['words!word', [Utility::createTextToken('words'), $word_token]],
      ['words$word', [Utility::createTextToken('words'), $word_token]],
      // Test whether overriding the default works and is case-sensitive (an
      // upper-case "X" separator must not split on a lower-case "x").
      [
        'wordXwordxword',
        [$word_token, Utility::createTextToken('wordxword')],
        ['spaces' => 'X'],
      ],
      [
        'word3word!word',
        [$word_token, Utility::createTextToken('word!word')],
        ['spaces' => '\d'],
      ],
      [
        'wordXwordRword',
        [$word_token, $word_token, $word_token],
        ['spaces' => 'R-Z'],
      ],
      [
        'wordXwordRword',
        [$word_token, $word_token, $word_token],
        ['spaces' => 'R-TW-Z'],
      ],
      [
        'wordXword word',
        [$word_token, $word_token, $word_token],
        ['spaces' => 'R-Z'],
      ],
      // Test whether minimum word size works.
      [
        'wordSwo',
        [$word_token],
        ['spaces' => 'R-Z'],
      ],
      [
        'wordSwo',
        [$word_token, Utility::createTextToken('wo')],
        ['spaces' => 'R-Z', 'minimum_word_size' => 2],
      ],
      [
        'word w',
        [$word_token],
        ['minimum_word_size' => 2],
      ],
      [
        'word w',
        [$word_token, Utility::createTextToken('w')],
        ['minimum_word_size' => 1],
      ],
      [
        'word wordword',
        [],
        ['minimum_word_size' => 10],
      ],
      [
        'foo-bar',
        [Utility::createTextToken('foobar')],
      ],
    ];
  }

  /**
   * Tests that the simplifyText() method handles CJK characters properly.
   *
   * The simplifyText() method does special things with numbers, symbols and
   * punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testCjkSupport() {
    $this->invokeMethod('prepare');

    // Create a string of CJK characters from various character ranges in
    // the Unicode tables. $starts contains the starts of the character ranges,
    // $ends the ends.
    $starts = [
      'CJK unified' => 0x4e00,
      'CJK Ext A' => 0x3400,
      'CJK Compat' => 0xf900,
      'Hangul Jamo' => 0x1100,
      'Hangul Ext A' => 0xa960,
      'Hangul Ext B' => 0xd7b0,
      'Hangul Compat' => 0x3131,
      'Half non-punct 1' => 0xff21,
      'Half non-punct 2' => 0xff41,
      'Half non-punct 3' => 0xff66,
      'Hangul Syllables' => 0xac00,
      'Hiragana' => 0x3040,
      'Katakana' => 0x30a1,
      'Katakana Ext' => 0x31f0,
      'CJK Reserve 1' => 0x20000,
      'CJK Reserve 2' => 0x30000,
      'Bomofo' => 0x3100,
      'Bomofo Ext' => 0x31a0,
      'Lisu' => 0xa4d0,
      'Yi' => 0xa000,
    ];
    $ends = [
      'CJK unified' => 0x9fcf,
      'CJK Ext A' => 0x4dbf,
      'CJK Compat' => 0xfaff,
      'Hangul Jamo' => 0x11ff,
      'Hangul Ext A' => 0xa97f,
      'Hangul Ext B' => 0xd7ff,
      'Hangul Compat' => 0x318e,
      'Half non-punct 1' => 0xff3a,
      'Half non-punct 2' => 0xff5a,
      'Half non-punct 3' => 0xffdc,
      'Hangul Syllables' => 0xd7af,
      'Hiragana' => 0x309f,
      'Katakana' => 0x30ff,
      'Katakana Ext' => 0x31ff,
      'CJK Reserve 1' => 0x2fffd,
      'CJK Reserve 2' => 0x3fffd,
      'Bomofo' => 0x312f,
      'Bomofo Ext' => 0x31b7,
      'Lisu' => 0xa4fd,
      'Yi' => 0xa48f,
    ];

    // Generate characters consisting of starts, midpoints, and ends.
    $chars = [];
    foreach ($starts as $key => $start) {
      $end = $ends[$key];
      // round() returns a float; cast it back so codepointToUtf8() receives
      // the integer code point it is documented to take.
      $mid = (int) round(0.5 * ($start + $end));
      $chars[] = static::codepointToUtf8($start);
      $chars[] = static::codepointToUtf8($mid);
      $chars[] = static::codepointToUtf8($end);
    }

    // Merge into a single string and tokenize.
    $text = implode('', $chars);
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);

    // Prepare the expected return value, which consists of all the 3-grams in
    // the original string, separated by spaces.
    $expected = '';
    $char_count = count($chars);
    for ($i = 2; $i < $char_count; ++$i) {
      $expected .= $chars[$i - 2];
      $expected .= $chars[$i - 1];
      $expected .= $chars[$i];
      $expected .= ' ';
    }
    $expected = trim($expected);

    // Verify that the output matches what we expect.
    $this->assertEquals($expected, $simplified_text, 'CJK tokenizer worked on all supplied CJK characters');

    // Verify that disabling the "overlap_cjk" setting works as expected.
    $this->processor->setConfiguration(['overlap_cjk' => FALSE]);
    $this->invokeMethod('prepare');
    $simplified_text = $this->invokeMethod('simplifyText', [$text]);
    $this->assertEquals($text, $simplified_text, 'CJK tokenizing is successfully disabled');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check – it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters).
    $this->processor->setConfiguration(['minimum_word_size' => 1]);
    $this->invokeMethod('prepare');

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = $this->invokeMethod('simplifyText', [$letters]);

    $this->assertEquals($letters, $out, 'Latin letters are not CJK tokenized');
  }

  /**
   * Converts a Unicode code point to a UTF-8 string.
   *
   * The PHP function "chr()" only works for ASCII characters up to character
   * 255. This function converts a number to the corresponding unicode
   * character. Adapted from functions supplied in comments on several functions
   * on php.net.
   *
   * @param int $num
   *   A Unicode code point.
   *
   * @return string
   *   A UTF-8 string containing the character corresponding to that code point.
   *   An empty string is returned for values of 2097152 (0x200000) and above.
   */
  protected static function codepointToUtf8($num) {
    // One byte: 0xxxxxxx.
    if ($num < 128) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 2048) {
      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 65536) {
      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if ($num < 2097152) {
      return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }

    return '';
  }

  /**
   * Tests that all Unicode characters simplify correctly.
   *
   * This test uses a Drupal core search file that was constructed so that the
   * even lines are boundary characters, and the odd lines are valid word
   * characters. (It was generated as a sequence of all the Unicode characters,
   * and then the boundary characters (punctuation, spaces, etc.) were split
   * off into their own lines). So the even-numbered lines should simplify to
   * nothing, and the odd-numbered lines we need to split into shorter chunks
   * and verify that simplification doesn't lose any characters.
   *
   * Line numbers in this description are 1-based, whereas the array keys used
   * in the code (and reported in assertion messages) are 0-based.
   *
   * @see \Drupal\search\Tests\SearchSimplifyTest::testSearchSimplifyUnicode()
   */
  public function testSearchSimplifyUnicode() {
    // Set the minimum word size to 1 (to split all CJK characters).
    $this->processor->setConfiguration(['minimum_word_size' => 1]);
    $this->invokeMethod('prepare');

    $input = file_get_contents($this->root . '/core/modules/search/tests/UnicodeTest.txt');
    // Fail with a clear message when the core fixture cannot be read, rather
    // than continuing with a FALSE value.
    $this->assertNotFalse($input, 'The Unicode test fixture could be read.');
    $basestrings = explode(chr(10), $input);
    $strings = [];
    foreach ($basestrings as $key => $string) {
      if ($key % 2) {
        // Even line (odd 0-based key), should be removed by simplifyText().
        $simplified = $this->invokeMethod('simplifyText', [$string]);
        $this->assertEquals('', $simplified, "Line $key is excluded from the index");
      }
      else {
        // Odd line, should be word characters (which might be expanded, but
        // never removed). Split this into 30-character chunks, so we don't run
        // into limits of truncation.
        $start = 0;
        while ($start < mb_strlen($string)) {
          $newstr = mb_substr($string, $start, 30);
          // Special case: leading zeros are removed from numeric strings,
          // and there's one string in this file that is numbers starting with
          // zero, so prepend a 1 on that string.
          if (preg_match('/^[0-9]+$/', $newstr)) {
            $newstr = '1' . $newstr;
          }
          $strings[] = $newstr;
          $start += 30;
        }
      }
    }
    foreach ($strings as $key => $string) {
      $simplified = $this->invokeMethod('simplifyText', [$string]);
      $this->assertGreaterThanOrEqual(mb_strlen($string), mb_strlen($simplified), "Nothing is removed from string $key.");
    }

    // Test the low-numbered ASCII control characters separately. They are not
    // in the text file because they are problematic for diff, especially \0.
    $string = '';
    for ($i = 0; $i < 32; $i++) {
      $string .= chr($i);
    }
    $this->assertEquals('', $this->invokeMethod('simplifyText', [$string]), 'Text simplification works for ASCII control characters.');
  }

  /**
   * Tests whether punctuation is treated correctly.
   *
   * @param string $passed_value
   *   The string passed to simplifyText().
   * @param string $expected_value
   *   The expected return value.
   * @param string $message
   *   The message to display for the assertion.
   *
   * @dataProvider searchSimplifyPunctuationProvider
   */
  public function testSearchSimplifyPunctuation($passed_value, $expected_value, $message) {
    // Set the minimum word size to 1 (to split all CJK characters).
    $this->processor->setConfiguration(['minimum_word_size' => 1]);
    $this->invokeMethod('prepare');

    $out = $this->invokeMethod('simplifyText', [$passed_value]);
    $this->assertEquals($expected_value, $out, $message);
  }

  /**
   * Provides test data for testSearchSimplifyPunctuation().
   *
   * @return array
   *   Arrays of parameters for testSearchSimplifyPunctuation(), each containing
   *   (in this order):
   *   - The string passed to simplifyText().
   *   - The expected return value.
   *   - The message to display for the assertion.
   */
  public function searchSimplifyPunctuationProvider() {
    $cases = [
      [
        '20.03/94-28,876',
        '20039428876',
        'Punctuation removed from numbers',
      ],
      [
        'great...drupal--module',
        'great drupal module',
        'Multiple dot and dashes are word boundaries',
      ],
      [
        'very_great-drupal.module',
        'verygreatdrupalmodule',
        'Single dot, dash, underscore are removed',
      ],
      [
        'regular,punctuation;word',
        'regular punctuation word',
        'Punctuation is a word boundary',
      ],
      [
        'Äußerung français repülőtér',
        'Äußerung français repülőtér',
        'Umlauts and accented characters are not treated as word boundaries',
      ],
    ];
    return $cases;
  }

  /**
   * Tests search keywords preprocessing.
   *
   * @param string|array $keys
   *   The original keys.
   * @param string|array $expected
   *   The expected keys after preprocessing.
   *
   * @dataProvider preprocessSearchQueryProvider
   */
  public function testPreprocessSearchQuery($keys, $expected) {
    $index = $this->createMock(Index::class);
    // These assertions tell static analysis that the mock is both an Index
    // and a mock object.
    assert($index instanceof Index);
    assert($index instanceof \PHPUnit_Framework_MockObject_MockObject);
    $index->method('status')->willReturn(TRUE);
    $this->processor->setIndex($index);

    $query = new Query($index);
    $query->setParseMode(new Direct([], 'direct', []));
    $query->keys($keys);

    $this->processor->preprocessSearchQuery($query);
    $this->assertEquals($expected, $query->getKeys());
  }

  /**
   * Provides test data for testPreprocessSearchQuery().
   *
   * @return array
   *   Arrays of parameters for testPreprocessSearchQuery(), each containing (in
   *   this order):
   *   - The original keys.
   *   - The expected keys after preprocessing.
   */
  public function preprocessSearchQueryProvider() {
    $cases = [
      'convert whitespace' => [
        "foo\tbar\n\nbaz ",
        'foo bar baz',
      ],
      'single dash' => [
        'foo-bar',
        'foobar',
      ],
      'multiple dashes' => [
        'foo--bar',
        'foo bar',
      ],
      'remove short word' => [
        'foo in bar',
        'foo bar',
      ],
      'single short word' => [
        'in',
        '',
      ],
    ];
    return $cases;
  }

}