Skip to content

Commit

Permalink
[DUCK] Lite模块的依赖精简重构 (#142)
Browse files Browse the repository at this point in the history
* 拆分test模块,在Lite上去掉了commons-io/json4s/commons-lang3/commons-text/emoji依赖
* 测试之前先生成模型
  • Loading branch information
du00cs authored Dec 16, 2022
1 parent ae39dc8 commit 7dfd37c
Show file tree
Hide file tree
Showing 15 changed files with 111 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/duckling.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ jobs:
- name: Run tests
run: |
cd duckling-fork-chinese
sbt +test
sbt duckModel
sbt +test
sbt coverage
sbt coverageAggregate
bash <(curl -s https://codecov.io/bash) -r du00cs/MiNLP -t 'd2de025e-e5b7-4115-a98e-07e6fc3d7001'
1 change: 1 addition & 0 deletions duckling-fork-chinese/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ out
duckling.log
/release/
/server/naive_bayes.json
core/src/main/resources/naive_bayes.kryo
16 changes: 12 additions & 4 deletions duckling-fork-chinese/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,20 @@ publish / skip := true // don't publish the root project

lazy val `duckling-fork-chinese` = project.in(file("."))
.settings(sharedSettings)
.aggregate(core, lite, server, benchmark)
.aggregate(core, lite, test, server, benchmark)

lazy val core = project
.settings(
name := "duckling-core",
sharedSettings,
libraryDependencies ++= coreDependencies
).dependsOn(test % "test->test")

lazy val test = project
.settings(
name := "duckling-test",
sharedSettings,
libraryDependencies ++= testDependencies
)

lazy val lite = project
Expand All @@ -66,8 +73,9 @@ lazy val lite = project
sharedSettings,
libraryDependencies ++= Seq(
logback % Provided // logging
)
).dependsOn(core)
),
excludeDependencies ++= liteExcludes
).dependsOn(core % "compile->compile", test % "test->test")

lazy val server = project
.settings(
Expand All @@ -79,7 +87,7 @@ lazy val server = project
Universal / javaOptions ++= Seq("-J-Xmx1g"),
scriptClasspath := Seq("../conf", "*")
)
.dependsOn(core)
.dependsOn(core, test % "test->test")
.enablePlugins(JavaServerAppPackaging)

lazy val benchmark = project
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,25 @@ import java.time.format.DateTimeFormatter

import org.json4s.{CustomSerializer, DefaultFormats, FieldSerializer, Formats, NoTypeHints, Serializer, TypeHints}
import org.json4s.JsonAST.JString
import org.json4s.jackson.Serialization.write

import com.typesafe.scalalogging.LazyLogging

import com.xiaomi.duckling.dimension.time._
import com.xiaomi.duckling.Types.{Entity, ResolvedVal, Token}
import com.xiaomi.duckling.Types.{Entity, ResolvedToken, ResolvedVal, ResolvedValue, Token}
import com.xiaomi.duckling.dimension.numeral.NumeralValue
import com.xiaomi.duckling.dimension.quantity.QuantityValue
import com.xiaomi.duckling.dimension.quantity.{QuantityData, QuantityValue}
import com.xiaomi.duckling.dimension.time.Types.{DuckDateTime, InstantValue}
import com.xiaomi.duckling.dimension.time.duration.DurationData
import com.xiaomi.duckling.dimension.time.enums.{Grain, IntervalDirection, IntervalType}
import com.xiaomi.duckling.dimension.Dimension
import com.xiaomi.duckling.dimension.ordinal.OrdinalData
import com.xiaomi.duckling.dimension.place.PlaceData
import com.xiaomi.duckling.dimension.time.predicates.SeriesPredicate
import com.xiaomi.duckling.ranking.Testing
import com.xiaomi.duckling.types.Node

object JsonSerde {
object JsonSerde extends LazyLogging {

private val node = FieldSerializer[Node]({
case ("production" | "features", _) => None
Expand Down Expand Up @@ -91,6 +95,27 @@ object JsonSerde {
case ("ge", _) => None
})

/**
* Field serializers that leave out selected fields when values are written as JSON.
*
* Some fields are hard to construct by hand, or do not need to be compared,
* so they are ignored. Each partial function maps the listed field names to
* `None`; all four serializers are appended to `formats` further down in this object.
*/
// TimeValue: "values" and "simple" are left out.
val sTimeValue = FieldSerializer[TimeValue]({
case ("values", _) => None
case ("simple", _) => None
})

// NumeralValue: "precision" is left out.
val sNumeralValue = FieldSerializer[NumeralValue]({
case ("precision", _) => None
})

// PlaceData: "texts" and "level" are left out.
val sPlaceData = FieldSerializer[PlaceData]({
case ("texts", _) => None
case ("level", _) => None
})

// QuantityData: "isLatent" is left out.
// NOTE(review): the val is named after QuantityValue but is registered for QuantityData — confirm this is intended.
val sQuantityValue = FieldSerializer[QuantityData]({
case ("isLatent", _) => None
})

/**
* json4s未发布的代码 [[https://github.com/json4s/json4s/blob/master/ext/src/main/scala/org/json4s/ext/JavaEnumSerializer.scala]]
*/
Expand Down Expand Up @@ -122,7 +147,11 @@ object JsonSerde {
quantityValue +
durationData +
placeData +
ordinalData
ordinalData +
sTimeValue +
sNumeralValue +
sPlaceData +
sQuantityValue

object DuckFormats extends DefaultFormats {
override val typeHintFieldName: String = "class"
Expand All @@ -134,4 +163,17 @@ object JsonSerde {
new JavaEnumNameSerializer[IntervalDirection]()
)
}

/**
 * Checks whether a resolved token carries the expected value.
 *
 * Both the expected value and the token's value are serialized with `write`,
 * and the resulting JSON strings are compared for equality.
 *
 * @param doc           document being checked; only its `rawInput` is used, for debug logging
 * @param resolvedToken token whose `value` is compared
 * @param value         expected resolved value
 * @return `true` when both serializations are identical
 */
def simpleCheck(doc: Document, resolvedToken: ResolvedToken, value: ResolvedValue): Boolean = {
  val expected = write(value)
  val actual = write(resolvedToken.value)
  val matched = expected == actual
  if (!matched && Testing.testOptions.debug) {
    logger.debug(s"checking: ${doc.rawInput}")
    // This branch is only reached on a mismatch, so the relation is always "!=".
    logger.debug("expected != actual")
    logger.debug(s"expected: $expected")
    logger.debug(s"actual : $actual")
  }
  matched
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import scala.collection.JavaConverters._
import scala.io.{Codec, Source}

import org.apache.commons.io.{FileUtils, IOUtils}
import org.apache.commons.io.FileUtils

import com.typesafe.scalalogging.LazyLogging

Expand All @@ -41,9 +42,9 @@ object Resources extends LazyLogging {
} else {
val input = url(path).openStream()
try {
val lines = IOUtils.readLines(input, StandardCharsets.UTF_8)
if (enableCache) Files.write(file, lines)
lines.asScala.toList
val lines = Source.fromInputStream(input)(Codec.UTF8).getLines().toList
if (enableCache) Files.write(file, lines.asJava)
lines
} finally {
input.close()
}
Expand All @@ -53,7 +54,7 @@ object Resources extends LazyLogging {
def readLines(resource: String): List[String] = {
val input = tryResource(resource)
try {
IOUtils.readLines(input, StandardCharsets.UTF_8).asScala.toList
Source.fromInputStream(input)(Codec.UTF8).getLines().toList
} finally {
input.close()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
package com.xiaomi.duckling.ranking

import com.typesafe.scalalogging.LazyLogging

import java.time.Duration

import scala.collection.mutable

import com.xiaomi.duckling.Types._
Expand All @@ -27,6 +27,7 @@ import com.xiaomi.duckling.ranking.Bayes.Classifier
import com.xiaomi.duckling.ranking.Testing.{Corpus, Example}
import com.xiaomi.duckling.ranking.Types._
import com.xiaomi.duckling.types.Node
import com.xiaomi.duckling.JsonSerde

object NaiveBayesLearning extends LazyLogging {
type Classifiers = Map[String, Classifier]
Expand Down Expand Up @@ -94,9 +95,9 @@ object NaiveBayesLearning extends LazyLogging {
def makeDataset1(rules: List[Rule],
context: Context,
options: Options)(dataset: Dataset, example: Example): Dataset = {
val (doc, predicate, _) = example
val (doc, rv) = example
val tokens = parseAndResolve(rules, doc, context, options)
val (ok, ko) = tokens.partition(predicate(doc, context))
val (ok, ko) = tokens.partition(JsonSerde.simpleCheck(doc, _, rv))

val nodesOK: Set[Node] = nodes(ok)
val nodesKO = nodes(ko).diff(nodesOK)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ object NaiveBayesRank extends LazyLogging {
Resources.inputStream(path)(in => KryoSerde.loadSerializedResource(in, classOf[Classifiers]))
} catch {
case t: Throwable =>
logger.warn("load model failed, now training from corpus", t)
logger.warn(s"load model failed, now training from corpus, reason: ${t.getMessage}")
makeClassifiers(
rules,
namedCorpus
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,33 +30,8 @@ import com.xiaomi.duckling.dimension.time.TimeValue
import com.xiaomi.duckling.{Document, JsonSerde}

object Testing extends LazyLogging {
type TestPredicate = (Document, Context) => ResolvedToken => Boolean
type Example = (Document, TestPredicate, Int)
type Example = (Document, ResolvedValue)
type Corpus = (Context, Options, List[Example])
type NegativeCorpus = (Context, Options, List[String])

/**
* 有一些字段构造起来比较困难,或者不用比较,可以忽略掉
*/
val sTimeValue = FieldSerializer[TimeValue]({
case ("values", _) => None
case ("simple", _) => None
})

val sNumeralValue = FieldSerializer[NumeralValue]({
case ("precision", _) => None
})

val sPlaceData = FieldSerializer[PlaceData]({
case ("texts", _) => None
case ("level", _) => None
})

val sQuantityValue = FieldSerializer[QuantityData]({
case ("isLatent", _) => None
})

implicit val formats = JsonSerde.formats + sTimeValue + sNumeralValue + sPlaceData + sQuantityValue

val testContext: Context =
Context(
Expand All @@ -68,23 +43,7 @@ object Testing extends LazyLogging {

def examples(output: ResolvedValue,
texts: List[String],
weight: Int = 1,
enableAnalyzer: Boolean = false): List[Example] = {
texts.map(text => (Document.fromText(text, enableAnalyzer = enableAnalyzer), simpleCheck(output), weight))
texts.map(text => (Document.fromText(text, enableAnalyzer = enableAnalyzer), output))
}

def simpleCheck(value: ResolvedValue): TestPredicate =
(doc: Document, _: Context) =>
(resolvedToken: ResolvedToken) => {
val expected = write(value)
val actual = write(resolvedToken.value)
val equals = expected == actual
if (!equals && testOptions.debug) {
logger.debug(s"checking: ${doc.rawInput}")
logger.debug(s"expected ${if (expected == actual) "=" else "!="} actual")
logger.debug(s"expected: $expected")
logger.debug(s"actual : $actual")
}
equals
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ object TimeAnswerSizeDetector {
val examples = Time.allExamples
val suspicious = examples
.map {
case (doc, _, _) =>
case (doc, _) =>
Api.analyze(doc.rawInput, context, options)
}
.filter(_.size > 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ package com.xiaomi.duckling.task

import java.io.File

import com.typesafe.scalalogging.LazyLogging

import com.xiaomi.duckling.ranking.NaiveBayesRank

object Training {
object Training extends LazyLogging {
def main(args: Array[String]): Unit = {
println(new File("").getAbsolutePath)
logger.info("working directory: " + new File("").getAbsolutePath)
NaiveBayesRank.main(Array("src/main/resources/naive_bayes.kryo"))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import com.typesafe.scalalogging.LazyLogging
import com.xiaomi.duckling.Api.analyze
import com.xiaomi.duckling.ranking.Testing.{testContext, testOptions}
import com.xiaomi.duckling.task.NaiveBayesDebug
import com.xiaomi.duckling.UnitSpec
import com.xiaomi.duckling.{JsonSerde, UnitSpec}

class GeneralCaseTest
extends UnitSpec
Expand All @@ -50,10 +50,10 @@ class GeneralCaseTest

it(s"${dim.name} - cases") {
forAll(corpusTable) {
case (doc, predicate, _) =>
case (doc, rv) =>
val candidates = analyze(doc.rawInput, testContext, options)
val found = candidates.zipWithIndex.find {
case (c, _) => predicate(doc, testContext)(c.token)
case (c, _) => JsonSerde.simpleCheck(doc, c.token, rv)
}
val matches = found match {
case Some((_, 0)) => logger.info(s"${doc.rawInput}"); true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package com.xiaomi.duckling.dimension

import com.xiaomi.duckling.{Api, UnitSpec}
import com.xiaomi.duckling.dimension.numeral.Numeral
import com.xiaomi.duckling.dimension.time.Time
import com.xiaomi.duckling.ranking.Testing.testContext
import com.xiaomi.duckling.Types.Options

/**
 * Smoke test: analyzes one sentence with only the Time and Numeral dimensions
 * as targets and expects exactly two results.
 */
class LiteDimensionsTest extends UnitSpec {
  describe("LiteTest") {
    it("should analyze") {
      val liteOptions = Options(targets = Set(Time, Numeral))
      val answers = Api.analyze("今天的天气怎么样123", testContext, liteOptions)
      answers should have size 2
    }
  }
}
18 changes: 13 additions & 5 deletions duckling-fork-chinese/project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import sbt._

object Dependencies {
lazy val testDependencies =
Seq(junit % Test, junitInterface % Test, scalaTest % Test, hamcrest % Test)
Seq(junit, junitInterface, scalaTest, hamcrest)

lazy val coreDependencies = Seq(
scalaz,
Expand All @@ -39,18 +39,26 @@ object Dependencies {
trie,
easyBert,
chill
) ++ testDependencies
)

lazy val serverDependencies = Seq(logback, spStarterWeb, spThymeleaf, reactor, lombok) ++ testDependencies
lazy val serverDependencies = Seq(logback, spStarterWeb, spThymeleaf, reactor, lombok)

lazy val benchmarkDependencies = Seq(scalaTest % Test, jmhAnn, jmhCore, slf4jnop)

// Modules excluded from the lite project's dependencies; build.sbt applies this list
// to the lite project via `excludeDependencies ++= liteExcludes`.
lazy val liteExcludes = Seq(
"com.vdurmont" % "emoji-java",
"commons-io" % "commons-io",
"org.apache.commons" % "commons-text",
"org.apache.commons" % "commons-lang3",
"org.json4s" %% "json4s-jackson"
)

// test
lazy val junit = "junit" % "junit" % "4.13.2"
lazy val hamcrest = "org.hamcrest" % "hamcrest" % "2.2"
lazy val junitInterface = "com.novocode" % "junit-interface" % "0.11"
lazy val scalatic = "org.scalactic" %% "scalactic" % "3.2.10"
lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.2.10"
lazy val scalatic = "org.scalactic" %% "scalactic" % "3.2.14"
lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.2.14"
lazy val scalaMeter = "com.storm-enroute" %% "scalameter" % "0.18"
lazy val jmhAnn = "org.openjdk.jmh" % "jmh-generator-annprocess" % "1.21"
lazy val jmhCore = "org.openjdk.jmh" % "jmh-core" % "1.21"
Expand Down
2 changes: 1 addition & 1 deletion duckling-fork-chinese/version.sbt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ThisBuild / version := "1.2-SNAPSHOT"
ThisBuild / version := "1.3-SNAPSHOT"

0 comments on commit 7dfd37c

Please sign in to comment.