Sensitive Word Filtering

Background

A chat project I worked on earlier involved sensitive word filtering. At the time I just used simple string matching; the word list was still small, so performance was acceptable.
Recently I had some spare time, so I looked into implementing sensitive word filtering with a trie tree.

References

How a trie tree works is explained in plenty of places online.

A performance comparison of several trie implementations

I found the C/C++ version of darts (Double-ARray Trie System).

Usage

Using it mainly involves a few interfaces (a minimal usage sketch follows the list):

  • build: pass in the array of sensitive words and build the trie

  • exactMatchSearch: look up a sensitive word that exactly matches the whole input string

  • commonPrefixSearch: prefix matching; finds the sensitive words that match starting from the beginning of the given string and returns them as an array
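
Before the full benchmark below, here is a minimal sketch of how these three calls fit together. It is my own example rather than something taken from the darts documentation; it assumes the same darts.h header used in the test code, and the keys "abc", "abcd" and "xyz" are made up. build expects unique keys in sorted order, which is why a std::set is used.

// Minimal usage sketch of Darts::DoubleArray (assumes darts.h is on the include path).
#include <cstdio>
#include <set>
#include <string>
#include <vector>
#include "darts.h"

int main() {
    // build() wants unique, sorted keys; std::set guarantees both.
    std::set<std::string> words = { "abc", "abcd", "xyz" };
    std::vector<const char*> keys;
    for (const auto& w : words) keys.push_back(w.c_str());

    Darts::DoubleArray da;
    da.build(keys.size(), &keys[0]);

    // exactMatchSearch: the whole query must equal a key; without explicit
    // values, the result is the key's index, or -1 when nothing matches.
    int value = -1;
    da.exactMatchSearch("abc", value);
    std::printf("exactMatchSearch(\"abc\") -> %d\n", value);

    // commonPrefixSearch: every key that is a prefix of the query,
    // reported from shortest to longest.
    Darts::DoubleArray::result_pair_type results[8];
    std::size_t n = da.commonPrefixSearch("abcdefg", results, 8);
    for (std::size_t i = 0; i < n; ++i) {
        std::printf("prefix match: value=%d, length=%u\n",
                    results[i].value, static_cast<unsigned>(results[i].length));
    }
    return 0;
}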

Performance Comparison

The comparison is mainly against the plain string matching used before. Tested with 10,000 sensitive words, the gap is more than 10,000x. In theory, the order of magnitude of the performance gap equals the number of sensitive words.
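
A rough back-of-the-envelope check (my own estimate; the symbols are not from the original post): let N be the number of sensitive words, M the text length, and L the length of the longest word.

cost(naive) ≈ N × M   (one find() pass over the text per word)
cost(trie)  ≈ M × L   (one pass over the text, at most L trie steps per position)
ratio       ≈ N / L   (grows roughly linearly with the number of sensitive words)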

Test Code

// redistest.cpp : This file contains the "main" function. Program execution begins and ends here.
//

#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

#include <windows.h>

#include "darts.h"

using namespace Darts;

// Generate num_keys unique random keys, each 1 to 8 uppercase ASCII letters.
void generate_valid_keys(std::size_t num_keys,
                         std::set<std::string>* valid_keys) {
    std::vector<char> key;
    while (valid_keys->size() < num_keys) {
        key.resize(1 + (std::rand() % 8));
        for (std::size_t i = 0; i < key.size(); ++i) {
            key[i] = 'A' + (std::rand() % 26);
        }
        valid_keys->insert(std::string(&key[0], key.size()));
    }
}

void generate_invalid_keys(std::size_t num_keys,
                           const std::set<std::string>& valid_keys,
                           std::set<std::string>* invalid_keys) {
    std::vector<char> key;
    while (invalid_keys->size() < num_keys) {
        key.resize(1 + (std::rand() % 8));
        for (std::size_t i = 0; i < key.size(); ++i) {
            key[i] = 'A' + (std::rand() % 26);
        }
        std::string generated_key(&key[0], key.size());
        if (valid_keys.find(generated_key) == valid_keys.end())
            invalid_keys->insert(std::string(&key[0], key.size()));
    }
}


typedef std::vector<bool> Mask;
typedef std::vector<std::string> FilterWords;
typedef std::vector<Mask> MaskArray;

// ASCII lower-casing; the unsigned char cast keeps ::tolower well-defined for
// the negative bytes of GBK-encoded Chinese characters.
inline std::string to_lower_str(const std::string& s) {
    std::string s2;
    s2.resize(s.size());
    std::transform(s.begin(), s.end(), s2.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    return s2;
}

std::set<std::string> g_valid_keys;
Darts::DoubleArray g_dic;

// Naive filter: for every sensitive word, scan the text with std::string::find
// and mask each occurrence with '*'.
void ReplaceAndReport(std::string sText)
{
    std::string sCopy = to_lower_str(sText);

    int nMaskLength = sCopy.length();

    Mask textMask(nMaskLength);

    FilterWords matchedWords;

    MaskArray wordMaskArray;

    for (const auto& word : g_valid_keys)
    {
        size_t pos = 0;
        if ((pos = sCopy.find(word, 0)) == std::string::npos) continue;

        matchedWords.push_back(word);

        size_t nWordLength = word.length();

        Mask wordMask(nMaskLength);

        // Mark every occurrence of this word, both in its own mask and in the text-wide mask.
        do {
            std::fill_n(wordMask.begin() + pos, nWordLength, true);
            std::fill_n(textMask.begin() + pos, nWordLength, true);
            pos += nWordLength;
        } while ((pos = sCopy.find(word, pos)) != std::string::npos);

        wordMaskArray.push_back(std::move(wordMask));
    }

    if (matchedWords.empty()) return;

    // Replace every masked position with '*'.
    int i = 0;
    std::replace_if(sText.begin(), sText.end(),
                    [&i, &textMask](char) { return textMask[i++]; }, '*');
}

// Trie-based filter: walk the text once; at each position ask the double-array
// trie for the sensitive words starting there and mask the longest match.
void ReplaceAndReportV2(std::string sText)
{
    // Note: sCopy mirrors ReplaceAndReport, but the search below runs on the original sText.
    std::string sCopy = to_lower_str(sText);

    static const std::size_t MAX_NUM_RESULTS = 16;

    for (std::size_t i = 0; i < sText.size();)
    {
        Darts::DoubleArray::result_pair_type results[MAX_NUM_RESULTS] = { 0 };

        std::size_t num_results = g_dic.commonPrefixSearch(&sText[i], results, MAX_NUM_RESULTS);

        if (num_results > 0)
        {
            // commonPrefixSearch reports matches from shortest to longest,
            // so the last stored result is the longest sensitive word here.
            std::size_t offset = results[(std::min)(num_results, MAX_NUM_RESULTS) - 1].length;
            std::fill_n(sText.begin() + i, offset, '*');
            i += offset;
        }
        else
        {
            if (sText[i] < 0) // negative byte: lead byte of a GBK-encoded Chinese character
            {
                i += 2;
            }
            else
            {
                i++;
            }
        }
    }
}

int main()
{
    static const std::size_t NUM_VALID_KEYS = 1 << 17;
    static const std::size_t NUM_INVALID_KEYS = 1 << 17;

    generate_valid_keys(NUM_VALID_KEYS, &g_valid_keys);

    g_valid_keys.insert("傻逼BCD");
    g_valid_keys.insert("傻逼A");
    g_valid_keys.insert("傻逼");

    std::vector<const char*> keys(g_valid_keys.size());
    std::vector<std::size_t> lengths(g_valid_keys.size());
    std::vector<Darts::DoubleArray::value_type> values(g_valid_keys.size());

    std::size_t key_id = 0;
    for (std::set<std::string>::const_iterator it = g_valid_keys.begin(); it != g_valid_keys.end(); ++it, ++key_id) {
        keys[key_id] = it->c_str();
    }

    // Not mentioned in the documentation: the keyword array must be sorted before
    // build(), which is why a std::set (always sorted) is used here.
    g_dic.build(keys.size(), &keys[0]);

    std::string strTest = "你是个傻逼BCD啊你是个玩";

    int testTimes = 1000;
    DWORD a1 = GetTickCount();
    for (int i = 0; i < testTimes; ++i)
    {
        ReplaceAndReport(strTest);
    }

    printf("naive find: %lu ms\n", GetTickCount() - a1);


    /*a1 = GetTickCount();
    int result;
    for (int i = 0; i < testTimes; ++i)
    {
        g_dic.exactMatchSearch(strTest.c_str(), result);
    }

    printf("exactMatchSearch: %lu ms\n", GetTickCount() - a1);
    */

    a1 = GetTickCount();
    for (int i = 0; i < testTimes; ++i)
    {
        ReplaceAndReportV2(strTest);
    }

    printf("commonPrefixSearch: %lu ms\n", GetTickCount() - a1);

    return 0;
}