生词分析接口
<p><strong>简要描述:</strong> </p>
<ul>
<li>生词分析接口</li>
</ul>
<p><strong>底层:</strong>
底层调用函数:analyzeArticleData
底层函数路径为:PandoraSearch\SearchAndRecommentdation\SentenceSegmentation\SentTokenization6.py
底层函数返回结果格式:</p>
<pre><code>入参:[{'p':u'这些路段正在施工,请绕行!还有这些信息,一定要看'},{'s':u'秋天养球根花卉,不注意这5点开花难;做好这4点,花开艳丽满屋香'}]
{
'kw_smap': {
u'秋天': [('para2_0', 1.0)],
u'满屋': [('para2_1', 0.23846153846153847)],
u'球根': [('para2_0', 0.8252941176470587)]
},
'kw_dict': {
u'秋天': {
'adj_score': 1.0,
'w': 1.0,
'freq': 1,
'txt': u'秋天',
'type': 'normal',
'nw': 1
},
u'满屋': {
'pkey': u'c41_Uksjrzfo',
'adj_score': 0.23846153846153847,
'w': 292.0,
'freq': 1,
'txt': u'满屋',
'type': 'normal',
'nw': 0
},
u'球根': {
'pkey': u'c77_MwyTca2C',
'adj_score': 0.8252941176470587,
'w': 241.0,
'freq': 1,
'txt': u'球根',
'type': 'normal',
'nw': 0
}
},
'sent_info': [{
'id': 'para1_0',
'mr': u'这些||路段||正在||施工||,||请||绕行||!||还有||这些||信息||,||一定||要||看'
}, {
'id': 'para2_0',
'mr': u'秋天||养||球根||花卉||,||不||注意||这||5||点||开花||难'
}, {
'id': 'para2_1',
'mr': u'做好||这||4||点||,||花||开||艳丽||满屋||香'
}],
'articleInfo': {
'ccomp': {
u'c41': 1,
u'c77': 1
},
'new_words': [(u'秋天', 1)],
'all_tags': [(u'秋天', 1.0), (u'球根', 0.8252941176470587), (u'满屋', 0.23846153846153847)]
}
}</code></pre>
<p><strong>本接口请求URL:</strong> </p>
<ul>
<li><code>/api/v3/xadmin/objects/new_words_analyze/</code></li>
</ul>
<p><strong>请求方式:</strong></p>
<ul>
<li>POST</li>
</ul>
<p><strong>参数:</strong> </p>
<table>
<thead>
<tr>
<th style="text-align: left;">参数名</th>
<th style="text-align: left;">必选</th>
<th style="text-align: left;">类型</th>
<th>说明</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left;">target_text</td>
<td style="text-align: left;">是</td>
<td style="text-align: left;">list</td>
<td>匹配的文本列表,是列表内套字典形式,字典的键名自拟,值为待匹配的文本</td>
</tr>
<tr>
<td style="text-align: left;">only_res</td>
<td style="text-align: left;">是</td>
<td style="text-align: left;">int</td>
<td>1:只返回分析结果;0:将分析结果写入数据库(此时数据库相关参数均不允许为空)</td>
</tr>
<tr>
<td style="text-align: left;">db_host</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>数据库host</td>
</tr>
<tr>
<td style="text-align: left;">db_user</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>数据库用户名</td>
</tr>
<tr>
<td style="text-align: left;">db_pwd</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>数据库密码</td>
</tr>
<tr>
<td style="text-align: left;">db_port</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>数据库端口</td>
</tr>
<tr>
<td style="text-align: left;">db_name</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>数据库的库名</td>
</tr>
<tr>
<td style="text-align: left;">tb_name</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>生词表名</td>
</tr>
<tr>
<td style="text-align: left;">sent_tb_name</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">str</td>
<td>句子表名</td>
</tr>
<tr>
<td style="text-align: left;">min_freq</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">int</td>
<td>最小频度,指定返回文本的最小频度</td>
</tr>
<tr>
<td style="text-align: left;">each_len</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">int</td>
<td>切词字数上限,默认8000</td>
</tr>
<tr>
<td style="text-align: left;">convert_time</td>
<td style="text-align: left;">否</td>
<td style="text-align: left;">bool</td>
<td>是否进行时间格式转换,默认为False(不转换)</td>
</tr>
</tbody>
</table>
<p><strong>
注:
当only_res为0时,db_host、db_user、db_pwd、db_port、db_name、tb_name、sent_tb_name均不允许为空,且数据库必须已存在;接口会自动创建两张数据表:生词表和句子表
</strong></p>
<p><strong>入参示例</strong>
入参:
target_text:[{"p":"秋天养球根花卉,不注意这5点开花难;做好这4点,花开艳丽满屋香"},{"s":"圆肩显胖没商量?看看马思纯、蒋欣穿搭术让圆肩不显胖"}]</p>
<p>成功结果:</p>
<pre><code>{
"status": 1,
"message": "success",
"resultObj": {
"sent_info": [
{
"input_key": "p",
"second_cut_sent": "秋天||养||球根||花卉||,||不||注意||这5点开花难||;||做好||这||4||点||,||花||开||艳丽||满屋||香", # 最终切词结果
"input_text": "秋天养球根花卉,不注意这5点开花难;做好这4点,花开艳丽满屋香", # 原文本
"id": "para1_0",
"mr": "秋天||养||球根||花卉||,||不||注意||这||5||点||开花||难||;||做好||这||4||点||,||花||开||艳丽||满屋||香"
},
{
"input_key": "s",
"second_cut_sent": "圆肩||显胖||没商量||?||看看||马思纯||、||蒋欣||穿||搭术||让||圆||肩不显||胖",
"input_text": "圆肩显胖没商量?看看马思纯、蒋欣穿搭术让圆肩不显胖",
"id": "para2_0",
"mr": "圆肩||显胖||没商量||?||看看||马思纯||、||蒋欣||穿||搭术||让||圆||肩不显||胖"
}
],
"words_info": {
",花开艳丽满屋": {
"concept": "",
"adj_score": 0.26548387096774195,
"weight": 1.0,
"text": ",花开艳丽满屋",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": null
},
"蒋欣": {
"concept": "演员", # 主键对应的概念
"adj_score": 0.4456,
"weight": 9768.0, # 权重
"text": "蒋欣", # 文本
"is_new": 0, # 是否新词,1新词,0非新词
"obj_key": "c54_o82DZA3E", # 主键
"alias": "",
"frequency": 1, # 频度
"cname": "",
"type": "normal" # 类型
},
"没商量": {
"concept": "",
"adj_score": 0.8416,
"weight": 1.0,
"text": "没商量",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"艳丽": {
"concept": "成语",
"adj_score": 0.1696774193548387,
"weight": 651.0,
"text": "艳丽",
"is_new": 0,
"obj_key": "c317_oImNS46o",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"这5点开花难": {
"concept": "",
"adj_score": 0.6487096774193548,
"weight": 1.0,
"text": "这5点开花难",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": null
},
"马思纯": {
"concept": "演员",
"adj_score": 0.604,
"weight": 5859.0,
"text": "马思纯",
"is_new": 0,
"obj_key": "c54_X7pFJdak",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"看看": {
"concept": "语言",
"adj_score": 0.6832,
"weight": 526.0,
"text": "看看",
"is_new": 0,
"obj_key": "c70_3NOnG46v",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"秋天": {
"concept": "",
"adj_score": 1.0,
"weight": 1.0,
"text": "秋天",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"满屋": {
"concept": "科学",
"adj_score": 0.10580645161290322,
"weight": 472.0,
"text": "满屋",
"is_new": 0,
"obj_key": "c41_Uksjrzfo",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"显胖": {
"concept": "语言",
"adj_score": 0.9208000000000001,
"weight": 570.0,
"text": "显胖",
"is_new": 0,
"obj_key": "c70_urtztEVx",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"做好": {
"concept": "语言",
"adj_score": 0.42516129032258065,
"weight": 570.0,
"text": "做好",
"is_new": 0,
"obj_key": "c70_xSa1nEVt",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"球根": {
"concept": "植物",
"adj_score": 0.9041935483870968,
"weight": 351.0,
"text": "球根",
"is_new": 0,
"obj_key": "c77_MwyTca2C",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"开花": {
"concept": "语言",
"adj_score": 0.5529032258064516,
"weight": 4635.0,
"text": "开花",
"is_new": 0,
"obj_key": "c70_FmQKmFNP",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"注意": {
"concept": "语言",
"adj_score": 0.7125806451612904,
"weight": 1449.0,
"text": "注意",
"is_new": 0,
"obj_key": "c70_vrvz2EBt",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"点,花开艳丽满屋": {
"concept": "",
"adj_score": 0.2974193548387097,
"weight": 1.0,
"text": "点,花开艳丽满屋",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": null
},
"圆肩": {
"concept": "",
"adj_score": 1.0,
"weight": 100,
"text": "圆肩",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"花卉": {
"concept": "语言",
"adj_score": 0.8403225806451613,
"weight": 9999999.0,
"text": "花卉",
"is_new": 0,
"obj_key": "c70_qItGYd2K",
"alias": "",
"frequency": 1,
"cname": "植物",
"type": "normal"
},
"搭术": {
"concept": "",
"adj_score": 0.3268,
"weight": 100,
"text": "搭术",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
},
"肩不显": {
"concept": "",
"adj_score": 0.1684,
"weight": 100,
"text": "肩不显",
"is_new": 1,
"obj_key": "",
"alias": "",
"frequency": 1,
"cname": "",
"type": "normal"
}
},
"extra_info": "quote_keywords:\n\ntime_keywords:\n\nnumber_keywords:\n5,4\nfiltered_words_msg:\nword:,,freq:2\n,花开艳丽满屋,7,0.5\t,不注意这5点,7,0.5\t,不注意这5,6,0.5\t,花开艳丽,5,0.5\t,不注意这,5,0.5\t,不注意,4,0.5\t,花开,3,0.5\t,不,2,0.5\t,花,2,0.5\nword:点,freq:2\n点,花开艳丽满屋,8,0.5\t点开花难;做好这,8,0.5\t点开花难;做好,7,0.5\t点,花开艳丽,6,0.5\t点开花难,4,0.5\t点,花开,4,0.5\t点开花,3,0.5\t点,花,3,0.5\nword:这,freq:2\n这5点开花难,6,0.5\t这4点,花开,6,0.5\t这5点开花,5,0.5\t这4点,花,5,0.5\t这5点,3,0.5\t这4点,3,0.5\t这5,2,0.5\t这4,2,0.5"
}
}</code></pre>
<p>失败结果:</p>
<pre><code>{
"status": 2,
"message": {
"target_text": [
"target_text内的元素必须是字典"
]
},
"resultObj": {}
}</code></pre>
<pre><code>{
"status": 2,
"message": {
"tb_name": [
"tb_name can not be null"
],
"sent_tb_name": [
"sent_tb_name can not be null"
],
"db_user": [
"db_user can not be null"
],
"db_pwd": [
"db_pwd can not be null"
],
"db_port": [
"db_port can not be null"
],
"db_name": [
"db_name can not be null"
],
"db_host": [
"db_host can not be null"
]
},
"resultObj": {}
}</code></pre>
<ul>
<li>写入生词表中的格式</li>
</ul>
<table>
<thead>
<tr>
<th style="text-align: left;">字段</th>
<th style="text-align: left;">类型</th>
<th style="text-align: left;">是否可为空</th>
<th>索引</th>
<th>注释</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left;">id</td>
<td style="text-align: left;">int</td>
<td style="text-align: left;">否</td>
<td>是</td>
<td>自增id</td>
</tr>
<tr>
<td style="text-align: left;">match_text</td>
<td style="text-align: left;">varchar(255)</td>
<td style="text-align: left;">否</td>
<td>否</td>
<td>文本</td>
</tr>
<tr>
<td style="text-align: left;">obj_key</td>
<td style="text-align: left;">varchar(50)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>主键</td>
</tr>
<tr>
<td style="text-align: left;">frequency</td>
<td style="text-align: left;">int</td>
<td style="text-align: left;">是</td>
<td>是</td>
<td>频度</td>
</tr>
<tr>
<td style="text-align: left;">is_new</td>
<td style="text-align: left;">int</td>
<td style="text-align: left;">是</td>
<td>是</td>
<td>是否新词,0为非新词,1为新词</td>
</tr>
<tr>
<td style="text-align: left;">type</td>
<td style="text-align: left;">varchar(50)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>类型</td>
</tr>
<tr>
<td style="text-align: left;">adj_score</td>
<td style="text-align: left;">float</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>分值</td>
</tr>
<tr>
<td style="text-align: left;">weight</td>
<td style="text-align: left;">float</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>权重</td>
</tr>
<tr>
<td style="text-align: left;">concept</td>
<td style="text-align: left;">varchar(50)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>主键对应的概念</td>
</tr>
<tr>
<td style="text-align: left;">alias</td>
<td style="text-align: left;">varchar(255)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td></td>
</tr>
<tr>
<td style="text-align: left;">cname</td>
<td style="text-align: left;">varchar(50)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td></td>
</tr>
</tbody>
</table>
<ul>
<li>写入句子表中的格式</li>
</ul>
<table>
<thead>
<tr>
<th style="text-align: left;">字段</th>
<th style="text-align: left;">类型</th>
<th style="text-align: left;">是否可为空</th>
<th>索引</th>
<th>注释</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left;">id</td>
<td style="text-align: left;">int</td>
<td style="text-align: left;">否</td>
<td>是</td>
<td>自增id</td>
</tr>
<tr>
<td style="text-align: left;">input_key</td>
<td style="text-align: left;">varchar(255)</td>
<td style="text-align: left;">是</td>
<td>是</td>
<td>输入的键</td>
</tr>
<tr>
<td style="text-align: left;">sent</td>
<td style="text-align: left;">longtext</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>输入的文本</td>
</tr>
<tr>
<td style="text-align: left;">cut_sent</td>
<td style="text-align: left;">longtext</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>切分后的文本</td>
</tr>
<tr>
<td style="text-align: left;">cut_words</td>
<td style="text-align: left;">longtext</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>二次切分后的文本</td>
</tr>
<tr>
<td style="text-align: left;">para_id</td>
<td style="text-align: left;">varchar(255)</td>
<td style="text-align: left;">是</td>
<td>否</td>
<td>文本在列表中的位置</td>
</tr>
</tbody>
</table>
<p><strong>返回参数说明</strong> </p>
<table>
<thead>
<tr>
<th style="text-align: left;">参数名</th>
<th style="text-align: left;">类型</th>
<th>说明</th>
</tr>
</thead>
<tbody>
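<tr>
<td style="text-align: left;">status</td>
<td style="text-align: left;">int</td>
<td>状态码,示例中成功为1,失败为2</td>
</tr>
<tr>
<td style="text-align: left;">message</td>
<td style="text-align: left;">str / dict</td>
<td>成功时为"success",失败时为各参数名对应的错误信息列表</td>
</tr>
<tr>
<td style="text-align: left;">resultObj</td>
<td style="text-align: left;">dict</td>
<td>返回结果,失败时为空字典</td>
</tr>
<tr>
<td style="text-align: left;">sent_info[].input_key</td>
<td style="text-align: left;">str</td>
<td>输入的键</td>
</tr>
<tr>
<td style="text-align: left;">sent_info[].input_text</td>
<td style="text-align: left;">str</td>
<td>原文本</td>
</tr>
<tr>
<td style="text-align: left;">sent_info[].second_cut_sent</td>
<td style="text-align: left;">str</td>
<td>最终切词结果,以"||"分隔</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.text</td>
<td style="text-align: left;">str</td>
<td>文本</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.obj_key</td>
<td style="text-align: left;">str</td>
<td>主键</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.concept</td>
<td style="text-align: left;">str</td>
<td>主键对应的概念</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.adj_score</td>
<td style="text-align: left;">float</td>
<td>分值</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.weight</td>
<td style="text-align: left;">float</td>
<td>权重</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.frequency</td>
<td style="text-align: left;">int</td>
<td>频度</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.is_new</td>
<td style="text-align: left;">int</td>
<td>是否新词,1为新词,0为非新词</td>
</tr>
<tr>
<td style="text-align: left;">words_info.{词}.type</td>
<td style="text-align: left;">str / null</td>
<td>类型</td>
</tr>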
</tbody>
</table>
<ul>
<li>备注:无</li>
</ul>