Trie(前缀树)是一种高效的字符串数据结构,广泛用于实现字典和搜索引擎中的自动补全功能。本文将详细探讨三种不同的Trie实现方法:基于数据索引字符映射的Trie、基于哈希表的Trie和基于二叉搜索树(BST)的Trie,并比较它们的优缺点和性能。
数据索引字符映射Trie实现
代码实现
public class TrieSet {
private static final int R = 128; // ASCII字符集大小
private Node root; // Trie的根节点
private static class Node {
private boolean isKey; // 是否是键(即一个完整的单词)
private DataIndexedCharMap<Node> next; // 子节点映射
private Node(boolean isKey, int R) {
this.isKey = isKey;
this.next = new DataIndexedCharMap<>(R);
}
}
public TrieSet() {
root = new Node(false, R);
}
public void add(String key) {
root = add(root, key, 0);
}
private Node add(Node x, String key, int d) {
if (x == null) {
x = new Node(false, R);
}
if (d == key.length()) {
x.isKey = true;
return x;
}
char c = key.charAt(d);
x.next.put(c, add(x.next.get(c), key, d + 1));
return x;
}
public boolean contains(String key) {
Node x = get(root, key, 0);
return x != null && x.isKey;
}
private Node get(Node x, String key, int d) {
if (x == null) return null;
if (d == key.length()) return x;
char c = key.charAt(d);
return get(x.next.get(c), key, d + 1);
}
public List<String> collect() {
List<String> result = new ArrayList<>();
collect(root, new StringBuilder(), result);
return result;
}
private void collect(Node x, StringBuilder prefix, List<String> result) {
if (x == null) return;
if (x.isKey) result.add(prefix.toString());
for (Character c : x.next.keys()) {
prefix.append(c);
collect(x.next.get(c), prefix, result);
prefix.deleteCharAt(prefix.length() - 1);
}
}
public List<String> keysWithPrefix(String prefix) {
List<String> result = new ArrayList<>();
Node x = get(root, prefix, 0);
if (x != null) {
collect(x, new StringBuilder(prefix), result);
}
return result;
}
public String longestPrefixOf(String query) {
int length = searchLongestPrefix(root, query, 0, 0);
return query.substring(0, length);
}
private int searchLongestPrefix(Node x, String query, int d, int length) {
if (x == null) return length;
if (x.isKey) length = d;
if (d == query.length()) return length;
char c = query.charAt(d);
return searchLongestPrefix(x.next.get(c), query, d + 1, length);
}
public static class DataIndexedCharMap<V> {
private V[] items;
public DataIndexedCharMap(int R) {
items = (V[]) new Object[R];
}
public void put(char c, V val) {
items[c] = val;
}
public V get(char c) {
return items[c];
}
public boolean contains(char c) {
return items[c] != null;
}
public Iterable<Character> keys() {
List<Character> list = new ArrayList<>();
for (int i = 0; i < items.length; i++) {
if (items[i] != null) {
list.add((char) i);
}
}
return list;
}
}
}
分析
优点:
- 常数时间的查找和插入(Θ(1))。
缺点:
- 每个节点都有固定大小的数组(128个链接),这会浪费大量空间,特别是在节点数量多但密度低的情况下。
基于哈希表的Trie实现
代码实现
public class TrieSet {
private Node root; // Trie的根节点
private static class Node {
private boolean isKey; // 是否是键(即一个完整的单词)
private ImprovedHashMap<Character, Node> next; // 子节点映射
private Node(boolean isKey) {
this.isKey = isKey;
this.next = new ImprovedHashMap<>();
}
}
public TrieSet() {
root = new Node(false);
}
public void add(String key) {
root = add(root, key, 0);
}
private Node add(Node x, String key, int d) {
if (x == null) {
x = new Node(false);
}
if (d == key.length()) {
x.isKey = true;
return x;
}
char c = key.charAt(d);
x.next.put(c, add(x.next.get(c), key, d + 1));
return x;
}
public boolean contains(String key) {
Node x = get(root, key, 0);
return x != null && x.isKey;
}
private Node get(Node x, String key, int d) {
if (x == null) return null;
if (d == key.length()) return x;
char c = key.charAt(d);
return get(x.next.get(c), key, d + 1);
}
public List<String> collect() {
List<String> result = new ArrayList<>();
collect(root, new StringBuilder(), result);
return result;
}
private void collect(Node x, StringBuilder prefix, List<String> result) {
if (x == null) return;
if (x.isKey) result.add(prefix.toString());
for (Character c : x.next.keySet()) {
prefix.append(c);
collect(x.next.get(c), prefix, result);
prefix.deleteCharAt(prefix.length() - 1);
}
}
public List<String> keysWithPrefix(String prefix) {
List<String> result = new ArrayList<>();
Node x = get(root, prefix, 0);
if (x != null) {
collect(x, new StringBuilder(prefix), result);
}
return result;
}
public String longestPrefixOf(String query) {
int length = searchLongestPrefix(root, query, 0, 0);
return query.substring(0, length);
}
private int searchLongestPrefix(Node x, String query, int d, int length) {
if (x == null) return length;
if (x.isKey) length = d;
if (d == query.length()) return length;
char c = query.charAt(d);
return searchLongestPrefix(x.next.get(c), query, d + 1, length);
}
public static class ImprovedHashMap<K, V> {
private Entry<K, V>[] table; // 存储哈希表的数组
private int size; // 哈希表中元素的数量
private static final float LOAD_FACTOR_THRESHOLD = 0.75f; // 负载因子阈值
public ImprovedHashMap() {
table = new Entry[101]; // 初始化哈希表
size = 0;
}
public void put(K key, V value) {
if (loadFactor() >= LOAD_FACTOR_THRESHOLD) {
resize(); // 负载因子超过阈值时进行扩容
}
int index = key.hashCode() % table.length;
while (table[index] != null) {
if (table[index].key.equals(key)) {
table[index].value = value;
return;
}
index = (index + 1) % table.length;
}
table[index] = new Entry<>(key, value);
size++;
}
public V get(K key) {
int index = key.hashCode() % table.length;
while (table[index] != null) {
if (table[index].key.equals(key)) {
return table[index].value;
}
index = (index + 1) % table.length;
}
return null;
}
public boolean containsKey(K key) {
return get(key) != null;
}
public Iterable<K> keySet() {
List<K> keys = new ArrayList<>();
for (Entry<K, V> entry : table) {
if (entry != null) {
keys.add(entry.key);
}
}
return keys;
}
private float loadFactor() {
return (float) size / table.length;
}
private void resize() {
Entry<K, V>[] oldTable = table;
table = new Entry[oldTable.length * 2];
size = 0;
for (Entry<K, V> entry : oldTable) {
if (entry != null) {
put(entry.key, entry.value);
}
}
}
private static class Entry<K, V> {
K key;
V value;
Entry(K key, V value) {
this.key = key;
this.value = value;
}
}
}
}
分析
优点:
- 动态分配空间,根据需要进行扩容,节省空间。
缺点:
- 查找和插入时间为O®,其中R是字母表的大小。
基于二叉搜索树的Trie实现
代码实现
public class TrieSet {
private Node root; // Trie的根节点
private static class Node {
private boolean isKey; // 是否是键(即一个完整的单词)
private BSTMap<Character, Node> next; // 子节点映射
private Node(boolean isKey) {
this.isKey = isKey;
this.next = new BSTMap<>();
}
}
public TrieSet() {
root = new Node(false);
}
public void add(String key) {
root = add(root, key, 0);
}
private Node add(Node x, String key, int d) {
if (x == null) {
x = new Node(false);
}
if (d == key.length()) {
x.isKey = true;
return x;
}
char c = key.charAt(d);
x.next.put(c, add(x.next.get(c), key, d + 1));
return x;
}
public boolean contains(String key) {
Node x = get(root, key, 0);
return x != null && x.isKey;
}
private Node get(Node x, String key, int d) {
if (x == null) return null;
if (d == key.length()) return x;
char c = key.charAt(d);
return get(x.next.get(c), key, d + 1);
}
public List<String> collect() {
List<String> result = new ArrayList<>();
collect(root, new StringBuilder(), result);
return result;
}
private void collect(Node x, StringBuilder prefix, List<String> result) {
if (x == null) return;
if (x.isKey) result.add(prefix.toString());
for (Character c : x.next.keySet()) {
prefix.append(c);
collect(x.next.get(c), prefix, result);
prefix.deleteCharAt(prefix.length() - 1);
}
}
public List<String> keysWithPrefix(String prefix) {
List<String> result = new ArrayList<>();
Node x = get(root, prefix, 0);
if (x != null) {
collect(x, new StringBuilder(prefix), result);
}
return result;
}
public String longestPrefixOf(String query) {
int length = searchLongestPrefix(root, query, 0, 0);
return query.substring(0, length);
}
private int searchLongestPrefix(Node x, String query, int d, int length) {
if (x == null) return length;
if (x.isKey) length = d;
if (d == query.length()) return length;
char c = query.charAt(d);
return searchLongestPrefix(x.next.get(c), query, d + 1, length);
}
public static class BSTMap<K extends Comparable<K>, V> {
private Node root; // BST的根节点
private class Node {
private K key;
private V val;
private Node left, right;
public Node(K key, V val) {
this.key = key;
this.val = val;
}
}
public void put(K key, V val) {
root = put(root, key, val);
}
private Node put(Node x, K key, V val) {
if (x == null) return new Node(key, val);
int cmp = key.compareTo(x.key);
if (cmp < 0) x.left = put(x.left, key, val);
else if (cmp > 0) x.right = put(x.right, key, val);
else x.val = val;
return x;
}
public V get(K key) {
return get(root, key);
}
private V get(Node x, K key) {
if (x == null) return null;
int cmp = key.compareTo(x.key);
if (cmp < 0) return get(x.left, key);
else if (cmp > 0) return get(x.right, key);
else return x.val;
}
public boolean containsKey(K key) {
return get(key) != null;
}
public Iterable<K> keySet() {
List<K> keys = new ArrayList<>();
inorder(root, keys);
return keys;
}
private void inorder(Node x, List<K> keys) {
if (x == null) return;
inorder(x.left, keys);
keys.add(x.key);
inorder(x.right, keys);
}
}
}
分析
优点:
- 根据需要创建子节点,节省空间。
- 查找和插入时间为O(log R),最坏情况下为O®。
缺点:
- 查找和插入时间受字母表大小和树的平衡性影响。
差别
1. 数据索引字符映射Trie
结构特点
- 每个节点包含一个固定大小的数组,用来存储子节点。
- 数组的大小通常是字符集的大小(如128个ASCII字符)。
代码实现
private static class Node {
private boolean isKey;
private DataIndexedCharMap<Node> next;
private Node(boolean isKey, int R) {
this.isKey = isKey;
this.next = new DataIndexedCharMap<>(R);
}
}
public static class DataIndexedCharMap<V> {
private V[] items;
public DataIndexedCharMap(int R) {
items = (V[]) new Object[R];
}
public void put(char c, V val) {
items[c] = val;
}
public V get(char c) {
return items[c];
}
public Iterable<Character> keys() {
List<Character> list = new ArrayList<>();
for (int i = 0; i < items.length; i++) {
if (items[i] != null) {
list.add((char) i);
}
}
return list;
}
}
优缺点
优点:
- 查找和插入时间为常数时间Θ(1)。
缺点:
- 空间浪费严重。每个节点都有一个固定大小的数组,不管实际使用了多少个子节点。
2. 基于哈希表的Trie
结构特点
- 每个节点使用哈希表来存储子节点。
- 哈希表根据需要动态调整大小,不会像固定大小的数组那样浪费空间。
代码实现
private static class Node {
private boolean isKey;
private ImprovedHashMap<Character, Node> next;
private Node(boolean isKey) {
this.isKey = isKey;
this.next = new ImprovedHashMap<>();
}
}
public static class ImprovedHashMap<K, V> {
private Entry<K, V>[] table;
private int size;
private static final float LOAD_FACTOR_THRESHOLD = 0.75f;
public ImprovedHashMap() {
table = new Entry[101];
size = 0;
}
public void put(K key, V value) {
if (loadFactor() >= LOAD_FACTOR_THRESHOLD) {
resize();
}
int index = key.hashCode() % table.length;
while (table[index] != null) {
if (table[index].key.equals(key)) {
table[index].value = value;
return;
}
index = (index + 1) % table.length;
}
table[index] = new Entry<>(key, value);
size++;
}
public V get(K key) {
int index = key.hashCode() % table.length;
while (table[index] != null) {
if (table[index].key.equals(key)) {
return table[index].value;
}
index = (index + 1) % table.length;
}
return null;
}
private float loadFactor() {
return (float) size / table.length;
}
private void resize() {
Entry<K, V>[] oldTable = table;
table = new Entry[oldTable.length * 2];
size = 0;
for (Entry<K, V> entry : oldTable) {
if (entry != null) {
put(entry.key, entry.value);
}
}
}
private static class Entry<K, V> {
K key;
V value;
Entry(K key, V value) {
this.key = key;
this.value = value;
}
}
}
优缺点
优点:
- 节省空间。只在需要时创建子节点。
缺点:
- 查找和插入时间为O®,其中R是字母表的大小,受哈希冲突影响。
3. 基于二叉搜索树的Trie
结构特点
- 每个节点使用二叉搜索树来存储子节点。
- 子节点按照字符顺序排列,可以减少空间浪费。
代码实现
private static class Node {
private boolean isKey;
private BSTMap<Character, Node> next;
private Node(boolean isKey) {
this.isKey = isKey;
this.next = new BSTMap<>();
}
}
public static class BSTMap<K extends Comparable<K>, V> {
private Node root;
private class Node {
private K key;
private V val;
private Node left, right;
public Node(K key, V val) {
this.key = key;
this.val = val;
}
}
public void put(K key, V val) {
root = put(root, key, val);
}
private Node put(Node x, K key, V val) {
if (x == null) return new Node(key, val);
int cmp = key.compareTo(x.key);
if (cmp < 0) x.left = put(x.left, key, val);
else if (cmp > 0) x.right = put(x.right, key, val);
else x.val = val;
return x;
}
public V get(K key) {
return get(root, key);
}
private V get(Node x, K key) {
if (x == null) return null;
int cmp = key.compareTo(x.key);
if (cmp < 0) return get(x.left, key);
else if (cmp > 0) return get(x.right, key);
else return x.val;
}
public Iterable<K> keySet() {
List<K> keys = new ArrayList<>();
inorder(root, keys);
return keys;
}
private void inorder(Node x, List<K> keys) {
if (x == null) return;
inorder(x.left, keys);
keys.add(x.key);
inorder(x.right, keys);
}
}
优缺点
优点:
- 根据需要创建子节点,节省空间。
- 查找和插入时间为O(log R),最坏情况下为O®,取决于BST的平衡性。
缺点:
- 查找和插入时间受字母表大小和树的平衡性影响。
结论
Trie是一种高效的字符串数据结构,适用于前缀匹配等操作。在实际应用中,根据具体需求选择合适的实现方法非常重要:
- 数据索引字符映射Trie:适用于字符集较小且节点密度高的情况。
- 基于哈希表的Trie:适用于字符集较大且节点分布稀疏的情况。
- 基于二叉搜索树的Trie:在需要节省空间且能接受略高的查找和插入时间的情况下效果较好。
在实际应用中,前缀匹配等操作是Trie结构的主要优势,这使得它在许多搜索和自动补全系统中占有一席之地。选择合适的Trie实现方法,将有效提升系统的性能和空间利用率。