taxonomy_overview-1.0.1/src/TagsOverviewTermNormalizer.php
src/TagsOverviewTermNormalizer.php
<?php
namespace Drupal\taxonomy_overview;
use Wamania\Snowball\StemmerFactory;
/**
* Provides utilities for normalizing and grouping taxonomy terms.
*
* This class uses stemming and string similarity to normalize terms
* and cluster them into groups of similar words. It helps in detecting
* and organizing duplicate or near-duplicate taxonomy terms.
*/
class TagsOverviewTermNormalizer {

  /**
   * Largest edit distance at which two normalized terms share a group.
   */
  protected const MAX_EDIT_DISTANCE = 2;

  /**
   * The stemmer instance used to reduce words to their root form.
   *
   * @var \Wamania\Snowball\Stemmer
   */
  protected $stemmer;

  /**
   * Constructs a TagsOverviewTermNormalizer object.
   *
   * @param string $language
   *   The language code used to initialize the stemmer (default: 'en').
   *   It is passed unchanged to StemmerFactory::create().
   */
  public function __construct($language = 'en') {
    $this->stemmer = StemmerFactory::create($language);
  }

  /**
   * Normalizes a term into a canonical string.
   *
   * - Converts the term to lowercase (multibyte-aware, UTF-8).
   * - Splits it on whitespace into tokens, ignoring empty tokens.
   * - Applies stemming to reduce each token to its root form.
   * - Sorts the stems as plain strings to make word order irrelevant.
   * - Joins them back into a single space-separated string.
   *
   * @param string $term
   *   The original taxonomy term label.
   *
   * @return string
   *   The normalized representation of the term. An empty or
   *   whitespace-only term yields an empty string.
   */
  public function normalize($term) {
    // strtolower() works on bytes and leaves non-ASCII letters such as "É"
    // untouched, so case variants of accented terms would never match.
    $lowercased = mb_strtolower((string) $term, 'UTF-8');

    // The "u" modifier makes PCRE treat the subject as UTF-8 instead of raw
    // bytes. PREG_SPLIT_NO_EMPTY drops the empty tokens that leading or
    // trailing whitespace would otherwise produce.
    $tokens = preg_split('/\s+/u', $lowercased, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === FALSE) {
      // preg_split() returns FALSE on a PCRE error; treat as "no tokens".
      $tokens = [];
    }

    $stems = array_map(fn($token) => $this->stemmer->stem($token), $tokens);

    // SORT_STRING forces plain string comparison. The default flag compares
    // numeric-looking stems as numbers ("10" equals "1e1"), which makes the
    // result depend on the original token order.
    sort($stems, SORT_STRING);

    return implode(' ', $stems);
  }

  /**
   * Groups a set of taxonomy terms by similarity.
   *
   * Process:
   * - Normalizes each term.
   * - Compares the normalized value with the key of every existing group
   *   using Levenshtein distance, in group creation order.
   * - Adds the term to the first group whose key is within
   *   MAX_EDIT_DISTANCE edits, meaning they are likely variations.
   * - Creates a new group, keyed by the normalized value, if none matches.
   *
   * Note that levenshtein() counts bytes, so a difference in one multibyte
   * character may count as more than one edit.
   *
   * @param array $terms
   *   An associative array of taxonomy term labels, keyed by term ID (tid).
   *
   * @return array
   *   A nested array. Each key is the normalized form of the first term
   *   placed in the group; each value is an array of original labels keyed
   *   by term ID.
   */
  public function groupSimilarTerms(array $terms) {
    $groups = [];

    foreach ($terms as $tid => $original) {
      $base = $this->normalize($original);
      $matchedKey = NULL;

      foreach (array_keys($groups) as $groupKey) {
        // PHP converts integer-like string keys ("2024") to integers, so
        // cast back to string before measuring the distance.
        if (levenshtein((string) $groupKey, $base) <= static::MAX_EDIT_DISTANCE) {
          $matchedKey = $groupKey;
          break;
        }
      }

      // Writing through the key (rather than a by-reference loop variable)
      // leaves no dangling reference inside the returned array.
      if ($matchedKey === NULL) {
        $groups[$base] = [$tid => $original];
      }
      else {
        $groups[$matchedKey][$tid] = $original;
      }
    }

    return $groups;
  }

}
