Skip to content

Commit b26f4f3

Browse files
author
燕睿涛
committed
优化
1 parent 83e4015 commit b26f4f3

File tree

2 files changed

+154
-22
lines changed

2 files changed

+154
-22
lines changed

php_pinyin.h

+44-6
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ typedef struct
3939
{
4040
char *complete;
4141
char *simple;
42-
unsigned int tone;
42+
size_t tone;
4343
} py_tone_info;
4444

4545
typedef struct
@@ -48,6 +48,42 @@ typedef struct
4848
char *to;
4949
} py_punctuation_map;
5050

51+
/*
 * One node of the singly linked list that holds the data of a single
 * processing pass: the original pinyin, the pinyin without tone marks,
 * the first letter in upper/lower case, and the tone number.
 */
struct _py_row_data_list
{
    char *ori;      /* original pinyin string for this item */
    char *none;     /* pinyin with the tone mark stripped; NULL when not computed */
    char ucfirst;   /* upper-case first letter; 0 when unset */
    char lcfirst;   /* lower-case first letter; 0 when unset or not an ASCII letter */
    size_t tone;    /* tone number; 0 when unset */
    struct _py_row_data_list *next;     /* following node, NULL at the tail */
};

typedef struct _py_row_data_list py_row_data_list;
61+
62+
/*
 * Allocate one py_row_data_list node with py_malloc(..., 0), store it in
 * `ptr`, and reset every field: `ori`, `none` and `next` become NULL,
 * `ucfirst`, `lcfirst` and `tone` become 0.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement and stays correct inside an unbraced if/else.
 *
 * `ptr` is expanded several times, so it must be a side-effect-free lvalue.
 */
#define CREATE_ROW_DATA_ITEM(ptr) do { \
        (ptr) = (py_row_data_list *)py_malloc(sizeof(py_row_data_list), 0); \
        (ptr)->ori = NULL; \
        (ptr)->none = NULL; \
        (ptr)->ucfirst = 0; \
        (ptr)->lcfirst = 0; \
        (ptr)->tone = 0; \
        (ptr)->next = NULL; \
    } while (0)
70+
71+
/*
 * Write into `ret` a copy of the string `ori` in which the occurrence of
 * `from` that starts at `beginPtr` is replaced by `to`.
 *
 *   ret      - destination buffer; receives the NUL-terminated result
 *   ori      - source string
 *   beginPtr - pointer into `ori` where `from` begins; it is advanced by
 *              the macro and points at the terminating NUL of `ori` afterwards
 *   from     - text being replaced (only its length is used)
 *   to       - replacement text
 *   j, k     - caller-supplied integer scratch variables; after the macro
 *              `j` is the length of the result and `k` the length of `to`
 *
 * Every argument is parenthesized so expression arguments such as
 * `base + 1` expand correctly. Arguments are expanded more than once and
 * must therefore be free of side effects.
 *
 * NOTE(review): the macro has no way to know the capacity of `ret`; the
 * caller must guarantee room for
 * strlen(ori) - strlen(from) + strlen(to) + 1 bytes.
 */
#define CHANGE_STR(ret, ori, beginPtr, from, to, j, k) do { \
        /* copy the part of ori that precedes the match */ \
        for ((j) = 0; (j) < ((beginPtr) - (ori)); (j)++) { \
            (ret)[(j)] = (ori)[(j)]; \
        } \
        /* append the replacement; stop at its NUL instead of re-running strlen */ \
        for ((k) = 0; (to)[(k)] != '\0'; (k)++) { \
            (ret)[(j)] = (to)[(k)]; \
            (j)++; \
        } \
        /* skip the replaced text, then copy the remainder of ori */ \
        (beginPtr) += py_strlen(from); \
        while (*(beginPtr)) { \
            (ret)[(j)] = *(beginPtr); \
            (beginPtr)++; \
            (j)++; \
        } \
        (ret)[(j)] = 0; \
    } while (0)
5187

5288
ZEND_BEGIN_MODULE_GLOBALS(pinyin)
5389
py_data_list *wordList;
@@ -61,21 +97,22 @@ void py_fill_data_list(const char *dir, unsigned int num);
6197
void py_analysis_chinese_tones(const char *line, char *chinese, char *tones);
6298
void str_replace(const char *from, const char *to, char *str, char *ret, zend_bool is_name);
6399
static int php_array_key_compare(const void *a, const void *b);
64-
zval *py_split_sentence(const char *chinese);
100+
py_row_data_list *py_split_sentence(const char *chinese, size_t flag);
101+
void py_destory_row_list(py_row_data_list *list);
65102

66103
#define MAX_READ_WORD_NUM 10
67104
#define true 1
68105
#define false 0
69106
#define PY_TONE_INFO_NUM 28
70107
#define PY_CHAR_TRANS_MAP_NUM 10
71108

72-
//用到的几个常量
109+
/* 转化时候的优化项 */
73110
#define PINYIN_NONE (1<<0)
74111
#define PINYIN_UNICODE (1<<1)
75112
#define PINYIN_ISNAME (1<<2)
76-
#define PINYIN_TRIM (1<<3) //省略标点符号
77-
#define PINYIN_FORMAT_EN (1<<4) //将标点符号转为英文的
78-
#define PINYIN_FORMAT_CH (1<<5) //将表单符号分割为一个
113+
#define PINYIN_ASCII (1<<3)
114+
#define PINYIN_UCFIRST (1<<4)
115+
#define PINYIN_LCFIRST (1<<5)
79116

80117
/* 保存数据的文件名 */
81118
#define FORMAT_WORD_PATH "%swords_%d"
@@ -92,6 +129,7 @@ zval *py_split_sentence(const char *chinese);
92129
#define py_strstr strstr
93130
#define py_malloc(size, persistent) pemalloc(size, persistent)
94131
#define py_strlen strlen
132+
#define py_memcpy memcpy
95133

96134
#define PY_GLOBAL(v) (pinyin_globals.v)
97135

pinyin.c

+110-16
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,10 @@ void py_fill_data_list(const char *dir, unsigned int num)
175175
/**
176176
*
177177
* @param chinese
178+
* @param flag
178179
* @return
179180
*/
180-
zval *py_split_sentence(const char *sentence)
181+
py_row_data_list *py_split_sentence(const char *sentence, size_t flag)
181182
{
182183
if(PY_GLOBAL(can_access) == false)
183184
{
@@ -188,21 +189,43 @@ zval *py_split_sentence(const char *sentence)
188189
char *chinese = estrdup(sentence);
189190

190191
//正常的拼音化
191-
py_data_list *wordListPtr = PY_GLOBAL(wordList)->next;
192+
py_data_list *wordListPtr;
192193
char *wordPtr = NULL,
193-
*splitItem = NULL;
194+
*splitItem = NULL,
195+
*splitItemPtr = NULL,
196+
tmpStr[100] = {0};
194197
size_t splitLen = 0,
195-
i = 0;
198+
i = 0,
199+
j = 0,
200+
k = 0,
201+
m = 0;
196202
zend_ulong numKey;
197203
#if PHP_MAJOR_VERSION < 7
198204
zval **entry;
199205
#else
200206
zval *entry;
201207
#endif
202208
zval *pinyinPieces = (zval *)py_malloc(sizeof(zval), 0);
203-
zval *pinyinSplit = (zval *)py_malloc(sizeof(zval), 0);
209+
py_row_data_list *rowDataList = (py_row_data_list *)py_malloc(sizeof(py_row_data_list), 0),
210+
*rowDataListPtr = rowDataList,
211+
*rowDataListTmpPtr = NULL;
204212

205213
array_init(pinyinPieces);
214+
215+
/* 替换姓名优先 */
216+
if (flag & PINYIN_ISNAME) {
217+
wordListPtr = PY_GLOBAL(surnameList)->next;
218+
while(wordListPtr != NULL)
219+
{
220+
while (NULL != (wordPtr = py_strstr(chinese, wordListPtr->key))) {
221+
py_add_index_stringl(pinyinPieces, wordPtr-chinese, wordListPtr->val, py_strlen(wordListPtr->val), 1);
222+
memset(wordPtr, CHINESE_SUB_CHAR, py_strlen(wordListPtr->key));
223+
}
224+
wordListPtr = wordListPtr->next;
225+
}
226+
}
227+
228+
wordListPtr = PY_GLOBAL(wordList)->next;
206229
while(wordListPtr != NULL)
207230
{
208231
while (NULL != (wordPtr = py_strstr(chinese, wordListPtr->key))) {
@@ -246,7 +269,6 @@ zval *py_split_sentence(const char *sentence)
246269
}
247270

248271
/* 格式化数组,将汉字切分为单个的一个,去掉制表符 */
249-
array_init(pinyinSplit);
250272
for (i=0; i<=strlen(sentence); i++) {
251273
#if PHP_MAJOR_VERSION < 7
252274
if (zend_hash_index_find(Z_ARRVAL_P(pinyinPieces), i, (void**)&entry) == FAILURE || py_strlen(Z_STRVAL_PP(entry)) <= 0)
@@ -258,10 +280,56 @@ zval *py_split_sentence(const char *sentence)
258280
continue;
259281
splitItem = strtok(Z_STRVAL_P(entry), "\t");
260282
#endif
261-
py_add_next_index_string(pinyinSplit, splitItem, 1);
283+
/* 不需要拼音声调 */
284+
CREATE_ROW_DATA_ITEM(rowDataListTmpPtr);
285+
rowDataListTmpPtr->ori = py_strdup(splitItem, 0);
286+
rowDataListPtr->next = rowDataListTmpPtr;
287+
rowDataListPtr = rowDataListTmpPtr;
288+
if (flag & (PINYIN_NONE|PINYIN_ASCII|PINYIN_LCFIRST|PINYIN_UCFIRST)) {
289+
for(m=0 ; m<PY_TONE_INFO_NUM; m++) {
290+
if (NULL != (wordPtr=py_strstr(splitItem, toneInfos[m].complete))){
291+
CHANGE_STR(tmpStr, splitItem, wordPtr, toneInfos[m].complete, toneInfos[m].simple, j, k);
292+
rowDataListTmpPtr->none = py_strdup(tmpStr, 0);
293+
rowDataListTmpPtr->tone = toneInfos[m].tone;
294+
break;
295+
}
296+
}
297+
}
298+
if (flag & (PINYIN_LCFIRST|PINYIN_UCFIRST)){
299+
if (NULL != rowDataListTmpPtr->none) {
300+
rowDataListTmpPtr->lcfirst = *rowDataListTmpPtr->none;
301+
if (!(rowDataListTmpPtr->lcfirst >= 65 && rowDataListTmpPtr->lcfirst <= 90)
302+
&& !(rowDataListTmpPtr->lcfirst >= 97 && rowDataListTmpPtr->lcfirst <= 122)){
303+
rowDataListTmpPtr->lcfirst = 0;
304+
}
305+
}
306+
}
307+
262308
while((splitItem = strtok(NULL, "\t")))
263309
{
264-
py_add_next_index_string(pinyinSplit, splitItem, 1);
310+
CREATE_ROW_DATA_ITEM(rowDataListTmpPtr);
311+
rowDataListTmpPtr->ori = py_strdup(splitItem, 0);
312+
rowDataListPtr->next = rowDataListTmpPtr;
313+
rowDataListPtr = rowDataListTmpPtr;
314+
if (flag & (PINYIN_NONE|PINYIN_ASCII|PINYIN_LCFIRST|PINYIN_UCFIRST)) {
315+
for(m=0 ; m<PY_TONE_INFO_NUM; m++) {
316+
if (NULL != (wordPtr=py_strstr(splitItem, toneInfos[m].complete))){
317+
CHANGE_STR(tmpStr, splitItem, wordPtr, toneInfos[m].complete, toneInfos[m].simple, j, k);
318+
rowDataListTmpPtr->none = py_strdup(tmpStr, 0);
319+
rowDataListTmpPtr->tone = toneInfos[m].tone;
320+
break;
321+
}
322+
}
323+
}
324+
if (flag & (PINYIN_LCFIRST|PINYIN_UCFIRST)){
325+
if (NULL != rowDataListTmpPtr->none) {
326+
rowDataListTmpPtr->lcfirst = *rowDataListTmpPtr->none;
327+
if (!(rowDataListTmpPtr->lcfirst >= 65 && rowDataListTmpPtr->lcfirst <= 90)
328+
&& !(rowDataListTmpPtr->lcfirst >= 97 && rowDataListTmpPtr->lcfirst <= 122)){
329+
rowDataListTmpPtr->lcfirst = 0;
330+
}
331+
}
332+
}
265333
}
266334
}
267335

@@ -270,7 +338,23 @@ zval *py_split_sentence(const char *sentence)
270338
efree(Z_ARRVAL_P(pinyinPieces));
271339
efree(pinyinPieces);
272340

273-
return pinyinSplit;
341+
return rowDataList;
342+
}
343+
344+
/**
 * Release a row-data list such as the one returned by py_split_sentence().
 *
 * The first node is treated as a head node: only the node itself is freed,
 * its `ori`/`none` members are not touched. For every following node the
 * `ori` and `none` strings are freed when non-NULL, then the node itself.
 *
 * NOTE(review): the walk starts at list->next, so whoever allocates the
 * head node must set its `next` member (NULL for an empty list) - confirm
 * against the allocation site.
 *
 * @param list head node of the list; a NULL pointer is accepted and ignored
 */
void py_destory_row_list(py_row_data_list *list)
{
    py_row_data_list *node = NULL,
                     *following = NULL;

    /* Nothing to release when no list was produced. */
    if (NULL == list) {
        return;
    }

    for (node = list->next; node != NULL; node = following) {
        /* Remember the successor before the current node is freed. */
        following = node->next;

        if (NULL != node->ori) {
            efree(node->ori);
        }
        if (NULL != node->none) {
            efree(node->none);
        }
        efree(node);
    }

    efree(list);
}
275359

276360
PHP_INI_BEGIN()
@@ -281,14 +365,24 @@ PHP_FUNCTION(pinyin)
281365
{
282366
char *chinese = NULL;
283367
size_t len;
368+
size_t l = PINYIN_UNICODE;
284369

285-
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &chinese, &len) == FAILURE) {
370+
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &chinese, &len, &l) == FAILURE) {
286371
return;
287372
}
288373

289-
zval *pinyinSplit = py_split_sentence(chinese);
290-
PY_RETURN_ARR(Z_ARRVAL_P(pinyinSplit));
291-
efree(pinyinSplit);
374+
py_row_data_list *list = py_split_sentence(chinese, l),
375+
*rowDataListPtr;
376+
377+
array_init(return_value);
378+
rowDataListPtr = list->next;
379+
while(rowDataListPtr != NULL) {
380+
if (l & PINYIN_UNICODE) {
381+
py_add_next_index_string(return_value, rowDataListPtr->ori, 1);
382+
}
383+
rowDataListPtr = rowDataListPtr->next;
384+
}
385+
py_destory_row_list(list);
292386
}
293387

294388
PHP_MINIT_FUNCTION(pinyin)
@@ -318,9 +412,9 @@ PHP_MINIT_FUNCTION(pinyin)
318412
REGISTER_LONG_CONSTANT("PINYIN_NONE", PINYIN_NONE, CONST_PERSISTENT | CONST_CS);
319413
REGISTER_LONG_CONSTANT("PINYIN_UNICODE", PINYIN_UNICODE, CONST_PERSISTENT | CONST_CS);
320414
REGISTER_LONG_CONSTANT("PINYIN_ISNAME", PINYIN_ISNAME, CONST_PERSISTENT | CONST_CS);
321-
REGISTER_LONG_CONSTANT("PINYIN_TRIM", PINYIN_TRIM, CONST_PERSISTENT | CONST_CS);
322-
REGISTER_LONG_CONSTANT("PINYIN_FORMAT_EN", PINYIN_FORMAT_EN, CONST_PERSISTENT | CONST_CS);
323-
REGISTER_LONG_CONSTANT("PINYIN_FORMAT_CH", PINYIN_FORMAT_CH, CONST_PERSISTENT | CONST_CS);
415+
REGISTER_LONG_CONSTANT("PINYIN_ASCII", PINYIN_ASCII, CONST_PERSISTENT | CONST_CS);
416+
REGISTER_LONG_CONSTANT("PINYIN_UCFIRST", PINYIN_UCFIRST, CONST_PERSISTENT | CONST_CS);
417+
REGISTER_LONG_CONSTANT("PINYIN_LCFIRST", PINYIN_LCFIRST, CONST_PERSISTENT | CONST_CS);
324418

325419
return SUCCESS;
326420
}

0 commit comments

Comments
 (0)