Skip to content

Commit 7ea0942

Browse files
committed
Free software
1 parent ac55042 commit 7ea0942

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+2704
-0
lines changed

.scalafmt.conf

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
version = "2.7.5"
2+
align = some
3+
maxColumn = 120
4+
continuationIndent.defnSite = 2
5+
assumeStandardLibraryStripMargin = true
6+
docstrings = JavaDoc
7+
lineEndings = preserve
8+
includeCurlyBraceInSelectChains = false
9+
danglingParentheses = true
10+
spaces {
11+
inImportCurlyBraces = true
12+
}
13+
optIn.annotationNewlines = true
14+
15+
rewrite.rules = [SortImports, RedundantBraces]
16+
17+
align.openParenCallSite = false
18+
align.openParenDefnSite = false
19+
newlines.alwaysBeforeTopLevelStatements = true
20+
newlines.penalizeSingleSelectMultiArgList = false
21+
rewrite.redundantBraces.stringInterpolation = true

LICENSE

+674
Large diffs are not rendered by default.

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
11
# pandora
2+
3+
A Scala library that reads documents with deeply nested data
4+
(e.g. XML, JSON) and produces a set of flat relational tables.

build.sbt

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Project version and Scala compiler version; the latter is centralised in project/Versions.scala.
version := "0.1"
scalaVersion := Versions.scala

// Scapegoat static-analysis settings: plugin version for the whole build, XML reports only,
// and the `overrideLevels:all=Warning` compiler-plugin option passed in the Scapegoat scope.
ThisBuild / scapegoatVersion := Versions.scapegoat
scapegoatReports := Seq("xml")
Scapegoat / scalacOptions ++= Seq("-P:scapegoat:overrideLevels:all=Warning")

// The single root project: json4s and scala-xml at compile scope, specs2 for tests,
// followed by the settings shared through project/Common.scala.
lazy val `pandora` = (project in file("."))
  .settings(
    publishArtifact := true,
    libraryDependencies ++= (Dependencies.json4s :+ Dependencies.scalaXml),
    libraryDependencies ++= Dependencies.specs2
  )
  .settings(Common.commonSettings: _*)

project/Common.scala

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import sbt.Keys._
2+
import sbt._
3+
4+
/** Build settings shared by every project of the build. */
object Common {

  // Compiler flags: warnings are fatal, and unused code, feature usage and deprecations are reported.
  private val compilerFlags = Seq(
    "-Xfatal-warnings",
    "-Ywarn-unused",
    "-feature",
    "-deprecation"
  )

  /** Organization name plus the compiler flags above. */
  lazy val commonSettings = Seq(
    organization := "com.credimi",
    scalacOptions ++= compilerFlags
  )
}

project/Dependencies.scala

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import sbt._
2+
3+
/** Library dependencies of the build; version numbers live in `Versions`. */
object Dependencies {

  /** The specs2 modules, all restricted to the "test" configuration. */
  val specs2 =
    for {
      module <- Seq("specs2-core", "specs2-matcher-extra", "specs2-scalacheck", "specs2-shapeless")
    } yield "org.specs2" %% module % Versions.specs2 % "test"

  /** json4s modules (currently only the native backend). */
  val json4s =
    for (module <- Seq("json4s-native")) yield "org.json4s" %% module % Versions.json4s

  /** The scala-xml module. */
  val scalaXml = "org.scala-lang.modules" %% "scala-xml" % Versions.scalaXml
}

project/Versions.scala

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/** Version numbers referenced from build.sbt and `Dependencies`. */
object Versions {

  // Scala compiler.
  val scala: String = "2.13.10"

  // Runtime libraries.
  val json4s: String   = "3.6.11"
  val scalaXml: String = "2.0.1"

  // Static analysis and testing.
  val scapegoat: String = "1.4.17"
  val specs2: String    = "4.13.0"
}

project/build.properties

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sbt.version=1.5.5

project/plugins.sbt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
addDependencyTreePlugin
2+
addSbtPlugin("com.sksamuel.scapegoat" % "sbt-scapegoat" % "1.1.1")
3+
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")
4+
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.5")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package com.credimi.pandora.core
2+
3+
/**
 * Type class for values of type `X` that can be turned into a byte array.
 *
 * @tparam X the type being serialized
 */
trait ByteSerializable[X] {

  /** Returns the byte representation of `x`. */
  def bytes(x: X): Array[Byte]
}

object ByteSerializable {

  /** A byte array serializes to itself: the very same array instance is returned, not a copy. */
  implicit val byteSerializableArrayByte: ByteSerializable[Array[Byte]] =
    new ByteSerializable[Array[Byte]] {
      def bytes(x: Array[Byte]): Array[Byte] = x
    }

  /** Extension syntax: adds `.bytes` to any value that has a [[ByteSerializable]] instance in scope. */
  implicit class Bytes[X](x: X)(implicit serializer: ByteSerializable[X]) {

    /** The byte representation of the wrapped value, as produced by its type-class instance. */
    def bytes: Array[Byte] =
      serializer.bytes(x)
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package com.credimi.pandora.core
2+
3+
/**
 * An annotated tree: every node carries an annotation of type `A`, and leaves
 * additionally carry a value of type `L`.
 *
 * @tparam A the annotation type
 * @tparam L the leaf value type
 */
sealed trait Tree[+A, +L] {

  /** The annotation attached to this node. */
  def annotation: A

  /**
   * Rebuilds the tree with `f` applied to the annotation of every node;
   * the shape and the leaf values are left untouched.
   */
  def mapAnnotation[B](f: A => B): Tree[B, L] =
    this match {
      case Leaf(ann, value) =>
        Leaf(f(ann), value)
      case List(ann, children) =>
        List(f(ann), children.map(child => child.mapAnnotation(f)))
      case Dict(ann, fields) =>
        Dict(f(ann), fields.iterator.map { case (name, child) => name -> child.mapAnnotation(f) }.toMap)
    }

  /** This node followed by all of its descendants, parents before their children. */
  lazy val subtrees: Seq[Tree[A, L]] =
    this match {
      case leaf @ Leaf(_, _) =>
        Seq(leaf)
      case list @ List(_, children) =>
        list +: (for { child <- children; sub <- child.subtrees } yield sub)
      case dict @ Dict(_, fields) =>
        dict +: (for { child <- fields.values.toSeq; sub <- child.subtrees } yield sub)
    }

  /** The annotations of all [[subtrees]], in the same order. */
  lazy val annotations: Seq[A] =
    for (subtree <- subtrees) yield subtree.annotation
}

/** A terminal node holding a single value. */
final case class Leaf[A, L](annotation: A, leaf: L) extends Tree[A, L]

/** A non-terminal node, i.e. a [[List]] or a [[Dict]]. */
sealed trait Node[A, L] extends Tree[A, L]

/** A node whose children are ordered and addressed by position. */
final case class List[A, L](annotation: A, list: Seq[Tree[A, L]]) extends Node[A, L]

/** A node whose children are addressed by field name. */
final case class Dict[A, L](annotation: A, dict: collection.immutable.Map[String, Tree[A, L]]) extends Node[A, L]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package com.credimi.pandora.core.annotations
2+
3+
import com.credimi.pandora.core
4+
import com.credimi.pandora.core.ByteSerializable.Bytes
5+
import com.credimi.pandora.core._
6+
import com.credimi.pandora.core.hash.HashFunction
7+
8+
/**
 * An annotation of type `A` paired with the Merkle hash of the subtree
 * rooted at the annotated node.
 */
case class Hash[A](hash: core.hash.Hash, annotation: A)

object Hash {

  /**
   * Annotates every node of `tree` with a Merkle hash computed bottom-up.
   *
   * Each node kind is hashed with a distinct leading tag byte:
   *  - leaf: `0` followed by the serialized leaf value;
   *  - list: `1` followed by the child hashes, in list order;
   *  - dict: `2` followed by, for every field, the UTF-8 field name and the child hash.
   *
   * Dict entries are hashed in ascending field-name order, so the result does
   * not depend on the iteration order of the underlying `Map`: two equal dicts
   * always receive the same hash, however they were built.
   *
   * @param tree         the tree to annotate
   * @param hashFunction the hash function applied to each node's byte representation
   * @return a tree of the same shape whose annotations wrap the original ones together with the hash
   */
  def merkle[A, L: ByteSerializable](tree: Tree[A, L])(implicit hashFunction: HashFunction): Tree[Hash[A], L] =
    tree match {
      case Leaf(annotation, leaf) =>
        val hash = hashFunction(0.toByte +: leaf.bytes)
        Leaf(Hash(hash, annotation), leaf)
      case List(annotation, list) =>
        val merkleList: Seq[Tree[Hash[A], L]] = list.map(merkle[A, L])
        val hash = hashFunction(
          1.toByte +:
            merkleList
              .map(_.annotation.hash.bytes)
              .foldLeft(Array.empty[Byte])(_ ++ _)
        )
        List(Hash(hash, annotation), merkleList)
      case Dict(annotation, dict) =>
        val merkleDict = dict.view.mapValues(merkle[A, L]).toMap
        val hash = hashFunction(
          2.toByte +:
            merkleDict.toSeq
              // Canonical order: Map iteration order is an implementation detail
              // (insertion order for small maps), so sort by field name first.
              .sortBy { case (field, _) => field }
              .map { case (field, child) => field.getBytes("UTF-8") ++ child.annotation.hash.bytes }
              .foldLeft(Array.empty[Byte])(_ ++ _)
        )
        Dict(Hash(hash, annotation), merkleDict)
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package com.credimi.pandora.core.annotations
2+
3+
import com.credimi.pandora.core
4+
import com.credimi.pandora.core.hash.HashFunction
5+
import com.credimi.pandora.core.path.Step
6+
import com.credimi.pandora.core.{ hash, ByteSerializable, Tree }
7+
8+
/**
 * Identity information attached to a tree node.
 *
 * @param id              the identifier of the node (see [[Ids.nodeId]])
 * @param parentId        the identifier of the parent node, `None` when no parent data was supplied
 * @param nonListAncestor the closest ancestor that is not a list node, if any
 * @param index           the position of the node when the last step of its path is an index
 * @param annotation      the wrapped annotation
 */
case class Ids[A](
  id: hash.Hash,
  parentId: Option[hash.Hash],
  nonListAncestor: Option[Ids.Ancestor],
  index: Option[Int],
  annotation: A
)

object Ids {

  /** Path and identifier of an ancestor node. */
  case class Ancestor(path: core.path.Path, id: hash.Hash)

  /** The last step of `path`, or `None` when the path has no steps. */
  def endOfPath(path: core.path.Path): Option[Step] =
    path.steps.lastOption

  /**
   * Computes a node identifier by hashing, in this order: the bytes of the
   * parent identifier (if any), the serialized last step of `path` (if any)
   * and the bytes of the node's own hash.
   */
  def nodeId(parentId: Option[hash.Hash], path: core.path.Path, nodeHash: hash.Hash)(implicit
    hashFunction: HashFunction
  ): hash.Hash = {
    val parentBytes = parentId.toArray.flatMap(_.bytes)
    val stepBytes   = endOfPath(path).toArray.flatMap(implicitly[ByteSerializable[Step]].bytes)
    hashFunction(parentBytes ++ stepBytes ++ nodeHash.bytes)
  }

  /** What a child needs to know about its parent: the parent's ids and whether the parent is a list node. */
  case class ParentData[A](annotation: Ids[Path[Hash[A]]], isList: Boolean)

  /**
   * Builds the [[Ids]] annotation of a single node.
   *
   * @param parentData data about the parent node, `None` for a node without parent
   * @param annotation the node's path-and-hash annotation
   */
  def ids[A, L](parentData: Option[ParentData[A]], annotation: Path[Hash[A]])(implicit
    hashFunction: HashFunction
  ): Ids[Path[Hash[A]]] = {
    val parentId: Option[hash.Hash] = parentData.map(_.annotation.id)

    // The parent itself qualifies as the non-list ancestor when it is not a list node ...
    val nonListParent: Option[Ancestor] =
      parentData
        .filterNot(_.isList)
        .map(parent => Ancestor(parent.annotation.annotation.path, parent.annotation.id))

    // ... otherwise the parent's own non-list ancestor is inherited.
    val nonListAncestor: Option[Ancestor] =
      nonListParent.orElse(parentData.flatMap(_.annotation.nonListAncestor))

    val path     = annotation.path
    val nodeHash = annotation.annotation.hash

    Ids(
      id = nodeId(parentId = parentId, path = path, nodeHash = nodeHash),
      parentId = parentId,
      nonListAncestor = nonListAncestor,
      index = endOfPath(path).collect { case Step.Index(position) => position },
      annotation = annotation
    )
  }

  /**
   * Annotates `tree` and all of its descendants with [[Ids]], given the data
   * of the node's parent. Each node is processed before its children, which
   * then receive the freshly computed ids of that node as their parent data.
   */
  def ids[A, L](
    parentData: Option[ParentData[A]]
  )(tree: Tree[Path[Hash[A]], L])(implicit hashFunction: HashFunction): Tree[Ids[Path[Hash[A]]], L] =
    tree match {
      case core.Leaf(annotation, leaf) =>
        core.Leaf(ids(parentData, annotation), leaf)
      case core.List(annotation, list) =>
        val nodeAnnotation                          = ids(parentData, annotation)
        val asParent: Option[ParentData[A]] = Some(ParentData[A](annotation = nodeAnnotation, isList = true))
        core.List(nodeAnnotation, list.map(child => ids(asParent)(child)))
      case core.Dict(annotation, dict) =>
        val nodeAnnotation                          = ids(parentData, annotation)
        val asParent: Option[ParentData[A]] = Some(ParentData[A](annotation = nodeAnnotation, isList = false))
        core.Dict(nodeAnnotation, dict.view.mapValues(child => ids(asParent)(child)).toMap)
    }

  /** Annotates a whole tree with [[Ids]], treating `tree` as a node without parent. */
  def ids[A, L](tree: Tree[Path[Hash[A]], L])(implicit hashFunction: HashFunction): Tree[Ids[Path[Hash[A]]], L] =
    ids(None)(tree)

}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package com.credimi.pandora.core.annotations
2+
3+
import com.credimi.pandora.core
4+
import com.credimi.pandora.core._
5+
import com.credimi.pandora.core.path.Step
6+
7+
/** An annotation of type `A` paired with the path that leads to the annotated node. */
case class Path[A](path: core.path.Path, annotation: A)

object Path {

  /**
   * Annotates every node of `tree` with its path relative to the root of `tree`.
   *
   * The root receives the empty path. Children are annotated recursively, and
   * every node of a child's subtree then gets the step leading to that child
   * (an index for list elements, a field name for dict entries) prepended to
   * its path.
   */
  def paths[A, L](tree: Tree[A, L]): Tree[Path[A], L] =
    tree match {
      case Leaf(annotation, leaf) =>
        Leaf(Path(core.path.Path.empty, annotation), leaf)
      case List(annotation, list) =>
        val pathsList: Seq[Tree[Path[A], L]] =
          list.map(paths[A, L]).zipWithIndex.map { case (child, index) =>
            prefixed(Step.Index(index), child)
          }
        List(Path(core.path.Path.empty, annotation), pathsList)
      case Dict(annotation, dict) =>
        val pathsDict = dict.view
          .mapValues(paths[A, L])
          .map { case (field, child) => (field, prefixed(Step.Field(field), child)) }
          .toMap
        Dict(Path(core.path.Path.empty, annotation), pathsDict)
    }

  /** Prepends `step` to the path of every node in `child`, leaving the wrapped annotations as they are. */
  private def prefixed[A, L](step: Step, child: Tree[Path[A], L]): Tree[Path[A], L] =
    child.mapAnnotation[Path[A]](located => Path[A](step +: located.path, located.annotation))
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
package com.credimi.pandora.core.hash
2+
3+
/** A hash value, stored as an immutable vector of bytes so that equality is structural. */
case class Hash(bytes: Vector[Byte])

object Hash {

  /** Convenience constructor taking the bytes as individual arguments. */
  def apply(bytes: Byte*): Hash = new Hash(bytes.toVector)
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package com.credimi.pandora.core.hash
2+
3+
import java.security.MessageDigest
4+
5+
/** A function turning an array of bytes into a [[Hash]]. */
trait HashFunction {

  /** Hashes `bytes`. */
  def apply(bytes: Array[Byte]): Hash
}

object HashFunction {

  /** SHA-256, backed by `java.security.MessageDigest`; a fresh digest instance is obtained on every call. */
  val sha256: HashFunction = new HashFunction {

    def apply(bytes: Array[Byte]): Hash = {
      val digest = MessageDigest.getInstance("SHA-256")
      Hash(digest.digest(bytes).toVector)
    }
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package com.credimi.pandora
2+
3+
import com.credimi.pandora.core.annotations.Hash.merkle
4+
import com.credimi.pandora.core.annotations.Ids
5+
import com.credimi.pandora.core.annotations.Ids.ids
6+
import com.credimi.pandora.core.annotations.Path.paths
7+
import com.credimi.pandora.core.hash.HashFunction
8+
import com.credimi.pandora.core.table.Table.UpsertTable
9+
import com.credimi.pandora.core.table.Tables.UpsertTables
10+
import com.credimi.pandora.core.table.{ Row, Table, Tables }
11+
import com.credimi.pandora.core.write.Write
12+
13+
package object core {

  val singleValueFieldName = "value"

  import com.credimi.pandora.core.annotations.{ Hash, Path }

  /**
   * Runs the three annotation passes over `tree`, in this order: Merkle
   * hashes, then paths, then ids.
   */
  private[core] def annotated[A, L: ByteSerializable](tree: Tree[A, L])(implicit
    hashFunction: HashFunction
  ): Tree[Ids[Path[Hash[A]]], L] = {
    val hashed  = merkle(tree)
    val located = paths(hashed)
    ids(located)
  }

  /**
   * Applies a single write to `tables`: the table addressed by the write
   * (an empty one when it does not exist yet) gets the row built from the
   * write upserted at the write's row id.
   */
  private[core] def writeStep[L](tables: Tables[L], write: Write[L]): Tables[L] =
    tables.upsert(write.tableId) { existing =>
      existing.getOrElse(Table.empty).upsert(write.rowId)(Row.fromWrite(write))
    }

  /**
   * Turns a tree into a set of tables: the tree is annotated, every subtree
   * is converted into zero or more writes, and the writes are applied one
   * after the other starting from an empty set of tables.
   *
   * @param tree              the tree to flatten
   * @param preserveListNodes forwarded unchanged to `Write.write`
   */
  def tables[A, L: ByteSerializable](tree: Tree[A, L], preserveListNodes: Boolean = false)(implicit
    hashFunction: HashFunction
  ): Tables[L] = {
    val writes = annotated(tree).subtrees.flatMap { subtree =>
      Write.write(subtree, preserveListNodes = preserveListNodes).toSeq
    }
    writes.foldLeft[Tables[L]](Tables.empty[L])(writeStep)
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package com.credimi.pandora.core.path
2+
3+
/** A sequence of steps leading from a root node to one of its descendants. */
case class Path(steps: Seq[Step]) {

  /** A new path with `field` appended after the last step. */
  def :+(field: Step): Path = copy(steps = steps :+ field)

  /** A new path with `field` prepended before the first step. */
  def +:(field: Step): Path = copy(steps = field +: steps)

  /** The final step of the path, `None` for the empty path. */
  lazy val lastStep: Option[Step] = steps.lastOption

  /** The path without its final step, `None` for the empty path. */
  def parentPath: Option[Path] =
    if (steps.isEmpty) None
    else Some(Path(steps.dropRight(1)))
}

object Path {

  /** The path with no steps. */
  def empty: Path = Path(Seq.empty[Step])

  // can't call it apply for erasure issues
  /** Builds a path from the given steps, in order. */
  def make(steps: Step*): Path = Path(Seq(steps: _*))
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package com.credimi.pandora.core.path
2+
3+
import com.credimi.pandora.core.ByteSerializable
4+
5+
import java.math.BigInteger
6+
7+
/** One step of a path: either a field name or a position. */
sealed trait Step

object Step {

  /** A step selecting the child stored under `field`. */
  case class Field(field: String) extends Step

  /** A step selecting the child at position `index`. */
  case class Index(index: Int) extends Step

  /**
   * Byte serialization of a step: a tag byte (`0` for fields, `1` for
   * indices) followed by the UTF-8 field name or by the two's-complement
   * big-endian bytes of the index as produced by `BigInteger.toByteArray`.
   */
  implicit val byteSerializableStep: ByteSerializable[Step] = new ByteSerializable[Step] {

    def bytes(x: Step): Array[Byte] =
      x match {
        case Field(name)     => 0.toByte +: name.getBytes("UTF-8")
        case Index(position) => 1.toByte +: BigInteger.valueOf(position.toLong).toByteArray
      }
  }
}

0 commit comments

Comments
 (0)