-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
DuNinglin [杜宁林]
committed
Apr 7, 2021
1 parent
0721ed7
commit 1dc1a8e
Showing
256 changed files
with
52,613 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
language: scala | ||
jdk: openjdk8 | ||
scala: | ||
- 2.11.12 | ||
|
||
# TODO 将CI触发限定在duckling-fork-chinese目录中 | ||
|
||
script: | ||
# 加载sbt-launch有可能失败 | ||
- mkdir -p /home/travis/.sbt/launchers/1.5.0 | ||
- wget https://repo1.maven.org/maven2/org/scala-sbt/sbt-launch/1.5.0/sbt-launch-1.5.0.jar -O /home/travis/.sbt/launchers/1.5.0/sbt-launch.jar | ||
- cd duckling-fork-chinese | ||
- sbt clean coverage test | ||
- sbt coverageAggregate | ||
after_success: | ||
- bash <(curl -s https://codecov.io/bash) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
.DS_Store | ||
|
||
# Intellij | ||
.idea | ||
*.iml | ||
|
||
# sbt | ||
project/project | ||
project/target | ||
target | ||
|
||
# bloop | ||
.bloop | ||
out | ||
.bsp | ||
|
||
duckling.log | ||
/release/ | ||
/server/naive_bayes.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Copyright (c) 2020, Xiaomi and/or its affiliates. All rights reserved. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
BSD License | ||
|
||
For Duckling software | ||
|
||
Copyright (c) 2016-present, Facebook, Inc. All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without modification, | ||
are permitted provided that the following conditions are met: | ||
|
||
* Redistributions of source code must retain the above copyright notice, this | ||
list of conditions and the following disclaimer. | ||
|
||
* Redistributions in binary form must reproduce the above copyright notice, | ||
this list of conditions and the following disclaimer in the documentation | ||
and/or other materials provided with the distribution. | ||
|
||
* Neither the name Facebook nor the names of its contributors may be used to | ||
endorse or promote products derived from this software without specific | ||
prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR | ||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | ||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
# duckling-fork-chinese | ||
[![Build](https://travis-ci.com/XiaoMi/MiNLP.svg?branch=master)](https://travis-ci.com/XiaoMi/MiNLP.svg?branch=master) [![codecov](https://codecov.io/gh/XiaoMi/MiNLP/branch/master/graph/badge.svg)](https://codecov.io/gh/XiaoMi/MiNLP) | ||
|
||
[facebook/duckling](https://github.com/facebook/duckling)项目的scala复刻版,本项目目前只服务于中文解析,暂时只提供中文文档。 | ||
|
||
> 每一种支持的解析类型,比如时间、数字等,被称为一个dimension。 | ||
[支持的Dimension](./doc/支持的Dimension.md) | ||
|
||
想更多了解duckling是什么,与同类工具的比较可以参考[Duckling/Introduction](https://duckling.wit.ai/#introduction)。 | ||
|
||
目前文档稀少,需要补充什么,欢迎在Issue区提。 | ||
|
||
## 1. 调用 | ||
|
||
在使用之前,适当进行繁转简、全角转半角,可以减轻匹配压力。 | ||
|
||
返回结果的结构需要提前了解定义,暂不在此中列出,目前需要自行查看代码。 | ||
|
||
### 1.1 Scala | ||
|
||
``` | ||
sbt - "duckling" % "duckling-core_2.11" % VERSION | ||
``` | ||
|
||
```scala | ||
import duckling.Api | ||
import duckling.dimension.place.{Place, PlaceData} | ||
|
||
val answers = Api.analyze(ns, new Context(), Options(targets = Set(Place))) | ||
val candidates = answers.flatMap { answer => | ||
// answer.token.value是一个ResolvedValue,ResolvedValue是所有结果的共同trait,需要强转/匹配至实现 | ||
answer.token.value match { | ||
case placeData: PlaceData => | ||
placeData.candidates.map(c => (c.getPathStr(), answer.token)) | ||
case _ => Nil | ||
} | ||
} | ||
``` | ||
|
||
### 1.2 Java | ||
|
||
做了方便Java使用的analyzeJ方法,如果觉得使用不够顺滑可以在issue区提出难受的点。 | ||
|
||
``` | ||
gradle - duckling:duckling-core_2.11:VERSION | ||
``` | ||
|
||
```java | ||
import com.google.common.collect.Lists; | ||
import duckling.Api; | ||
import duckling.Types.Context; | ||
import duckling.Types.Options; | ||
import duckling.dimension.EnumeratedDimension; | ||
|
||
List<EnumeratedDimension> dims = Lists.newArrayList(EnumeratedDimension.Time, EnumeratedDimension.Duration); | ||
final Types.Context context = new Types.Context(ZonedDateTime.now(), Locale.CHINA); | ||
final Types.Options options = new Types.Options(dims, false); | ||
|
||
// context 已在上面构造(使用 ZonedDateTime),此处无需重复声明 | ||
String query = ...; | ||
List<Answer> answers = Api.analyzeJ(query, context, options); | ||
|
||
answers.forEach(answer -> { | ||
answer.token().range(); // 文本区间 [x, y) | ||
ResolvedValue value = answer.token().value(); | ||
if(value instanceof TimeValue) { | ||
TimeValue tv = (TimeValue) value; | ||
tv.holiday(); // 如果是从清明节识别来的,这里会标记"清明" | ||
// 时间点 | ||
if(tv.timeValue() instanceof SimpleValue) { | ||
InstantValue t = ((SimpleValue) tv.timeValue()).instant(); | ||
t.datetime(); // LocalDateTime 时间点 | ||
t.grain(); // 粒度 | ||
} else if(tv.timeValue() instanceof IntervalValue) { | ||
// 左闭右开 | ||
IntervalValue interval = (IntervalValue) tv.timeValue(); | ||
interval.start().datetime(); | ||
interval.end().datetime(); | ||
} | ||
} | ||
// ... | ||
}); | ||
``` | ||
|
||
### 1.3 API&WEB | ||
|
||
项目从[duckling.wit.ai](https://duckling.wit.ai/)抽取了部分样式,仿了一版可视化解析的代码(无意冒犯,侵删)。 | ||
|
||
**测试** | ||
|
||
```bash | ||
sbt | ||
> project server | ||
> runMain duckling.WebServer | ||
``` | ||
|
||
**打包** | ||
|
||
```bash | ||
# (可选)训练模型,如果不提供naive_bayes.json,会在启动时现场训练 | ||
sbt "project server; runMain duckling.ranking.NaiveBayesRank naive_bayes.json" | ||
cp server/naive_bayes.json server/src/main/resources/ | ||
|
||
# 打包 | ||
sbt server/stage | ||
|
||
cd server/target/universal/stage | ||
bash bin/duckling-server | ||
``` | ||
|
||
|
||
|
||
在线体验(暂无,需要自行打包):`/duckling?dim=...&query=...` | ||
|
||
[晚上八点提醒我跑十分钟二十公里](http://localhost:8181/duckling?dim=numeral,time,duration&query=%E6%99%9A%E4%B8%8A%E5%85%AB%E7%82%B9%E6%8F%90%E9%86%92%E6%88%91%E8%B7%91%E5%8D%81%E5%88%86%E9%92%9F%E4%BA%8C%E5%8D%81%E5%85%AC%E9%87%8C) | ||
|
||
data:image/s3,"s3://crabby-images/98048/98048a59ce619e8f53f4ff5bf9052436a2ee22b2" alt="示例Query" | ||
|
||
|
||
URL中有dim参数,多个dim用,隔开,dim需要在URL中手工输入,了解前端的同学可以帮忙贡献一个页面交互(现在的开发对这些一窍不通)。 | ||
|
||
API调用:`/api?dim=...&query=...` | ||
|
||
## 2. Style Guide | ||
|
||
Intellij IDEA的设置中打开`Editor -> Code Style -> Scala`,点击Scheme右侧的小按钮,`Import Scheme…`,选择本项目下的`intellij_formating.xml`。 | ||
|
||
## 3. 发布 | ||
|
||
只会发布core项目,其它的会跳过: | ||
|
||
```bash | ||
sbt release | ||
``` | ||
|
||
## 4. 如何新增一个Dimension | ||
|
||
duckling-fork-chinese 对每一类模式词的抽取,创建了一类Dimension,如:Numeral, Time, PhoneNumber, ...等等 | ||
需要新增一类模式词时请注意以下要求: | ||
|
||
- 在core/src/main/scala/duckling/dimension路径下新增该Dimension的目录 | ||
- 在FullDimensions和EnumeratedDimension中引入新定义的Dimension | ||
- 在新目录下给出该功能的测试Examples | ||
- 在编写该Dimension的解析规则时尽可能复用现有的Dimension | ||
|
||
## 5. 如何参与 | ||
|
||
参考[How to Contribute](./doc/CONTRIBUTING.md) | ||
|
||
## 6. License | ||
|
||
duckling-fork-chinese Apache 2.0 | ||
|
||
项目是在用Scala重写facebook/duckling的基础上开展的,另附上duckling的原BSD License |
79 changes: 79 additions & 0 deletions
79
duckling-fork-chinese/benchmark/src/main/java/duckling/benchmark/DateTimeBenchmark.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/* | ||
* Copyright (c) 2020, Xiaomi and/or its affiliates. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package duckling.benchmark; | ||
|
||
|
||
import com.google.common.collect.Sets; | ||
import com.google.common.io.Resources; | ||
import duckling.Api; | ||
import duckling.Types; | ||
import duckling.Types.Context; | ||
import org.openjdk.jmh.annotations.*; | ||
import org.openjdk.jmh.runner.Runner; | ||
import org.openjdk.jmh.runner.RunnerException; | ||
import org.openjdk.jmh.runner.options.Options; | ||
import org.openjdk.jmh.runner.options.OptionsBuilder; | ||
|
||
import java.io.IOException; | ||
import java.net.URL; | ||
import java.nio.charset.StandardCharsets; | ||
import java.time.LocalDateTime; | ||
import java.time.ZonedDateTime; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Random; | ||
import java.util.concurrent.TimeUnit; | ||
|
||
@BenchmarkMode(Mode.SingleShotTime) | ||
@OutputTimeUnit(TimeUnit.MILLISECONDS) | ||
@Warmup(iterations = 1, time = 20) | ||
@Measurement(iterations = 100000) | ||
@Fork(1) | ||
@State(Scope.Benchmark) | ||
public class DateTimeBenchmark { | ||
Context context = new Context(ZonedDateTime.of(LocalDateTime.of(2016, 12, 8, 11, 30, 30), Types.ZoneCN()), Locale.CHINA); | ||
final Types.Options option = new Types.Options(Sets.newHashSet("time", "duration"), false); | ||
final Random rand = new Random(); | ||
|
||
|
||
private List<String> queries; | ||
|
||
@Setup | ||
public void setup() throws Exception { | ||
URL url = Resources.getResource(DateTimeBenchmark.class, "/time.txt"); | ||
queries = Resources.readLines(url, StandardCharsets.UTF_8); | ||
System.out.println(String.format("read %s examples", queries.size())); | ||
} | ||
|
||
public String query() { | ||
int n = rand.nextInt(queries.size()); | ||
return queries.get(n); | ||
} | ||
|
||
@Benchmark | ||
public void duckling() { | ||
Api.analyzeJ(query(), context, option); | ||
} | ||
|
||
public static void main(String[] args) throws RunnerException, IOException { | ||
Options opt = new OptionsBuilder() | ||
.include(".*" + DateTimeBenchmark.class.getSimpleName() + ".*") | ||
.build(); | ||
|
||
new Runner(opt).run(); | ||
} | ||
} |
Oops, something went wrong.