Skip to content

Commit

Permalink
Merge pull request #109 from DEIB-GECO/out_file_naming
Browse files Browse the repository at this point in the history
Output file naming
  • Loading branch information
acanakoglu authored Apr 10, 2018
2 parents 917ef03 + c5c97b7 commit 6f9e986
Show file tree
Hide file tree
Showing 12 changed files with 56 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ object GMQLExecuteCommand {
System.setProperty("current.date", dateFormat.format(new Date()));

private final val SYSTEM_TMPE_DIR = System.getProperty("java.io.tmpdir")
private final val DEFAULT_SCHEMA_FILE: String = "/test.schema";
private final val DEFAULT_SCHEMA_FILE: String = "/schema.xml";
private final val date = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class DFSRepository extends GMQLRepository with XMLDataSetRepository{
}

FS_Utilities.copyfiletoHDFS(schemaPath,
General_Utilities().getHDFSRegionDir(userName)+ new Path(samples.get(0).name).getParent.toString+ "/test.schema"
General_Utilities().getHDFSRegionDir(userName)+ new Path(samples.get(0).name).getParent.toString+ "/schema.xml"
)

// Set File size
Expand Down Expand Up @@ -148,7 +148,7 @@ class DFSRepository extends GMQLRepository with XMLDataSetRepository{
val hdfspath = conf.get("fs.defaultFS") + General_Utilities().getHDFSRegionDir(userName)

if (dataset.samples.nonEmpty) {
val regex = "(/+)(exp(/+))?([^/]+)$".r
val regex = "(/+)(files(/+))?([^/]+)$".r
val ds_folder = regex.replaceFirstIn(hdfspath + dataset.samples.head.name, "")
fs.delete(new Path(ds_folder), true)

Expand Down Expand Up @@ -194,7 +194,7 @@ class DFSRepository extends GMQLRepository with XMLDataSetRepository{
None
}).toList.asJava;
val schema =
readSchemaFile(dsPath + "/test.schema")
readSchemaFile(dsPath + "/schema.xml")
(samples,schema.fields.asJava)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class LFSRepository extends GMQLRepository with XMLDataSetRepository{
}
).map(x=> new GMQLSample(x.getPath)).toList.asJava

val schema = readSchemaFile(dsPath+ "/test.schema")
val schema = readSchemaFile(dsPath+ "/schema.xml")
(samples,schema.fields.asJava)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ object CLI {
System.setProperty("current.date", dateFormat.format(new Date()));

private final val SYSTEM_TMPE_DIR: String = System.getProperty("java.io.tmpdir")
private final val DEFAULT_SCHEMA_FILE:String = "/test.schema";
private final val DEFAULT_SCHEMA_FILE:String = "/schema.xml";
private final val date: String = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date());

private final val usage: String = "GMQL-Submit " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ class GMQLJob(val gMQLContext: GMQLContext, val script:GMQLScript, val username:
//todo: find a better way to avoid accesing hdfs at compilation time
val newPath =
if(Utilities().LAUNCHER_MODE equals Utilities().REMOTE_CLUSTER_LAUNCHER)
General_Utilities().getSchemaDir(user) + p.path + ".schema"
General_Utilities().getSchemaDir(user) + p.path + ".xml"
else getRegionFolder(p.path, user)
println(newPath)
new VariablePath(newPath, p.parser_name);
Expand All @@ -157,7 +157,7 @@ class GMQLJob(val gMQLContext: GMQLContext, val script:GMQLScript, val username:
val user = if (repositoryHandle.DSExistsInPublic(p.IDName)) "public" else this.username
val newPath =
if(Utilities().LAUNCHER_MODE equals Utilities().REMOTE_CLUSTER_LAUNCHER)
General_Utilities().getSchemaDir(user) + p.IDName + ".schema"
General_Utilities().getSchemaDir(user) + p.IDName + ".xml"
else getRegionFolder(p.IDName, user)
new VariableIdentifier(newPath);
} else {
Expand Down Expand Up @@ -284,7 +284,7 @@ class GMQLJob(val gMQLContext: GMQLContext, val script:GMQLScript, val username:
val user = if (repositoryHandle.DSExistsInPublic(inputDs)) "public" else this.username
val newPath =
if(Utilities().LAUNCHER_MODE equals Utilities().REMOTE_CLUSTER_LAUNCHER)
General_Utilities().getSchemaDir(user) + inputDs + ".schema"
General_Utilities().getSchemaDir(user) + inputDs + ".xml"
else getRegionFolder(inputDs, user)
newPath
}
Expand Down Expand Up @@ -442,7 +442,7 @@ class GMQLJob(val gMQLContext: GMQLContext, val script:GMQLScript, val username:

outputVariablesList.map { ds =>

val (samples, sch) = repositoryHandle.listResultDSSamples(ds + "/exp/", this.username)
val (samples, sch) = repositoryHandle.listResultDSSamples(ds + "/files/", this.username)

// println("samples")
// samples.asScala foreach println _
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class GmqlServer(var implementation : Implementation, binning_size : Option[Long
/**
* Class used to generate a variable reading from the file system
*
* @param paths The list of paths where the dataset is stored. It must contains two sub-directory named "meta" and "exp"
* @param paths The list of paths where the dataset is stored. It must contains two sub-directory named "meta" and "files"
*/
class UnfinishedREAD(paths : List[String]) {
/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class GMQLSparkExecutor(val binSize : BinSize = BinSize(), val maxBinDistance :

val variableDir = variable.metaDag.asInstanceOf[IRStoreMD].path.toString
val MetaOutputPath = variableDir + "/meta/"
val RegionOutputPath = variableDir + "/exp/"
val RegionOutputPath = variableDir + "/files/"
logger.debug("meta out: "+MetaOutputPath)
logger.debug("region out "+ RegionOutputPath)

Expand Down Expand Up @@ -144,8 +144,8 @@ class GMQLSparkExecutor(val binSize : BinSize = BinSize(), val maxBinDistance :
val profile = Profiler.profile(regions = regionRDD, meta = metaRDD, sc = sc)

try {
val output = fs.create(new Path(variableDir + "/exp/" + "profile.xml"));
val output_web = fs.create(new Path(variableDir + "/exp/" + "web_profile.xml"));
val output = fs.create(new Path(variableDir + "/files/" + "profile.xml"));
val output_web = fs.create(new Path(variableDir + "/files/" + "web_profile.xml"));

val os = new java.io.BufferedOutputStream(output)
val os_web = new java.io.BufferedOutputStream(output_web)
Expand Down Expand Up @@ -316,10 +316,10 @@ class GMQLSparkExecutor(val binSize : BinSize = BinSize(), val maxBinDistance :
}

def storeSchema(schema: String, path : String)= {
val schemaPath = path+"/exp/test.schema"
val schemaPath = path+"/files/schema.xml"
val br = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(schemaPath)), "UTF-8"));
br.write(schema);
br.close();
br.write(schema)
br.close()
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ object StoreGTFRD {
val fs = FileSystem.get(dfsPath.toUri(), conf);

val MetaOutputPath = path + "/meta/"
val RegionOutputPath = path + "/exp/"
val RegionOutputPath = path + "/files/"

logger.debug(MetaOutputPath)
logger.debug(RegionOutputPath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ object StoreTABRD {
val meta = executor.implement_md(associatedMeta,sc)

val MetaOutputPath = path + "/meta/"
val RegionOutputPath = path + "/exp/"
val RegionOutputPath = path + "/files/"

logger.debug(MetaOutputPath)
logger.debug(RegionOutputPath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,24 @@ object BedParserHelper {
class BedParser(delimiter: String, var chrPos: Int, var startPos: Int, var stopPos: Int, var strandPos: Option[Int], var otherPos: Option[Array[(Int, ParsingType.PARSING_TYPE)]]) extends GMQLLoader[(Long, String), Option[DataTypes.GRECORD], (Long, String), Option[DataTypes.MetaType]] with java.io.Serializable {

private val logger: Logger = LoggerFactory.getLogger(classOf[BedParser])
var parsingType = GMQLSchemaFormat.TAB
var parsingType: GMQLSchemaFormat.Value = GMQLSchemaFormat.TAB
var coordinateSystem: GMQLSchemaCoordinateSystem.Value = GMQLSchemaCoordinateSystem.ZeroBased
final val spaceDelimiter: String = " "
final val semiCommaDelimiter: String = ";"
private var otherPosGTF: Array[(Int, PARSING_TYPE)] = _
private var namePositionMap: Map[String, Int] = _
private var nameTypeMap: Map[String, PARSING_TYPE] = _

//added function in order to calculate GTF parameters once.
def calculateMapParameters(namePosition: Option[Seq[String]] = None): Unit = {
parsingType match {
case GMQLSchemaFormat.GTF =>
if (otherPos.getOrElse(Array.empty).length > 4) otherPosGTF = otherPos.get.tail.tail.tail.tail else otherPosGTF = Array.empty
//assume that namePosition order is same as otherPosGTF
namePositionMap = namePosition.getOrElse(Iterable.empty).map(_.toUpperCase).zipWithIndex.toMap
nameTypeMap = otherPosGTF.map(_._2).zip(namePosition.getOrElse(Seq.empty)).map(a => (a._2.toUpperCase, a._1)).toMap
}
}

// val otherPosExtended = otherPos.getOrElse(Array.empty).map()
@transient
private lazy val otherGtf: Map[String, (PARSING_TYPE, Int)] =
schema
.zipWithIndex
.map(x => x._1._1.toUpperCase -> (x._1._2, x._2))
.toMap

@transient
private lazy val otherGtfSize: Int = otherGtf.size

@deprecated
def calculateMapParameters(namePosition: Option[Seq[String]] = None): Unit = {}

/**
* Meta Data Parser to parse String to GMQL META TYPE (ATT, VALUE)
Expand Down Expand Up @@ -112,7 +110,7 @@ class BedParser(delimiter: String, var chrPos: Int, var startPos: Int, var stopP
val s: Array[String] = t._2.split(delimiter, -1)

val other = parsingType match {
case GMQLSchemaFormat.GTF => {
case GMQLSchemaFormat.GTF =>
// GTF file format definition
// 0) seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix. Important note: the seqname must be one used within Ensembl, i.e. a standard chromosome name or an Ensembl identifier such as a scaffold ID, without any additional content such as species or assembly. See the example GFF output below.
// 1) source - name of the program that generated this feature, or the data source (database or project name)
Expand All @@ -129,30 +127,34 @@ class BedParser(delimiter: String, var chrPos: Int, var startPos: Int, var stopP
val score = parseRegion(ParsingType.DOUBLE, s(5))
val frame = parseRegion(ParsingType.STRING, s(7))

val otherValues = Array.fill[GValue](otherGtfSize)(GNull())
otherValues(0) = source
otherValues(1) = feature
otherValues(2) = score
otherValues(3) = frame

val restValues = Array.fill[GValue](otherPosGTF.length)(GNull())

val values = s(8) split semiCommaDelimiter
values.foreach { value =>
val split = value.trim.split(spaceDelimiter, 2)
if (split.length ==2 ) {
if (split.length == 2) {
val attName = split(0).toUpperCase
if(namePositionMap.contains(attName)) {
if (otherGtf.contains(attName)) {
val attVal = split(1).trim.replaceAll("""^"(.+?)\"$""", "$1")
val schemaPos = namePositionMap(attName)
val parseType = nameTypeMap(attName)
restValues(schemaPos) = parseRegion(parseType, attVal)
val (parseType, arrayPos) = otherGtf(attName)
otherValues(arrayPos) = parseRegion(parseType, attVal)
}
else
logger.warn("Skipped the attribute value, it is not defined in schema " + value)
}
}
Array(source, feature, score, frame) ++ restValues
}
otherValues
case _ =>
//if other position is defined then convert every element into GValue with parseRegion, else return empty array
otherPos.getOrElse(Array.empty).map { case (pos, parseType) => parseRegion(parseType, s(pos)) }
}

Some((new GRecordKey(t._1,
Some((GRecordKey(t._1,
s(chrPos).trim,
if (coordinateSystem == GMQLSchemaCoordinateSystem.OneBased) s(startPos).trim.toLong - 1 else s(startPos).trim.toLong,
s(stopPos).trim.toLong,
Expand All @@ -165,10 +167,12 @@ class BedParser(delimiter: String, var chrPos: Int, var startPos: Int, var stopP
}
catch {
case e: Throwable =>
logger.warn("line:\t" + t._2)
logger.warn("problem: " + e.getClass.getCanonicalName + " - " + e.getCause + " - " + e.getMessage)
logger.warn("Chrom: " + chrPos + "\tStart: " + startPos + "\tStop: " + stopPos + "\tstrand: " + strandPos);
logger.warn("Values: " + otherPos.getOrElse(Array[(Int, ParsingType.PARSING_TYPE)]()).map(x => "(" + x._1 + "," + x._2 + ")").mkString("\t") + "\n" +
"This line can not be casted (check the spacing): \n\t\t" + t);
logger.warn("error: ", e)
None //throw ParsingException.create(t._2, e)
}
}
Expand Down Expand Up @@ -330,7 +334,11 @@ class CustomParser extends BedParser("\t", 0, 1, 2, Some(3), Some(Array((4, Pars
val fs: FileSystem = FileSystem.get(path.toUri(), FSConfig.getConf);

//todo: remove this hard fix used for remote execution
val XMLfile: InputStream = fs.open(new Path(dataset + (if (!dataset.endsWith("schema")) "/test.schema" else "")))
val XMLfile: InputStream =
if (!fs.exists(new Path(dataset + (if (!dataset.endsWith("xml")) "/schema.xml" else ""))))
fs.open(new Path(dataset + (if (!dataset.endsWith("xml")) "/test.schema" else "")))
else
fs.open(new Path(dataset + (if (!dataset.endsWith("xml")) "/schema.xml" else "")))
var schematype = GMQLSchemaFormat.TAB
var coordinatesystem = GMQLSchemaCoordinateSystem.Default
var schema: Array[(String, ParsingType.Value)] = null
Expand Down Expand Up @@ -401,11 +409,6 @@ class CustomParser extends BedParser("\t", 0, 1, 2, Some(3), Some(Array((4, Pars
}.toList


val namePositionMap = valuesPositionsSchema.map(_._1)




val other: Array[(Int, ParsingType.Value)] = if (valuesPositions.length > 0)
Array[(Int, ParsingType.Value)]((1, ParsingType.STRING), (2, ParsingType.STRING), (5, ParsingType.DOUBLE), (7, ParsingType.STRING)) ++ valuesPositions
else
Expand All @@ -417,8 +420,6 @@ class CustomParser extends BedParser("\t", 0, 1, 2, Some(3), Some(Array((4, Pars
strandPos = Some(6)
otherPos = Some(other)

calculateMapParameters(Some(namePositionMap))

this.schema = List(("source", ParsingType.STRING), ("feature", ParsingType.STRING), ("score", ParsingType.DOUBLE), ("frame", ParsingType.STRING)) ++ valuesPositionsSchema
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ object writeMultiOutputFiles{
}

/**
* Move the meta data from the /meta folder to /exp file to be in the same folder with the samples files.
* Move the meta data from the /meta folder to /files file to be in the same folder with the samples files.
*
* @param originalPath [[String]] of the meta directory
*/
Expand All @@ -103,7 +103,7 @@ object writeMultiOutputFiles{
val files = fs.listStatus(new Path(originalPath));
files.foreach { sampleFile =>
val pt = new Path(sampleFile.getPath.toString)
val dist = new Path(new Path(originalPath).getParent.toString+"/exp/"+sampleFile.getPath.getName)
val dist = new Path(new Path(originalPath).getParent.toString+"/files/"+sampleFile.getPath.getName)
if(pt.getName !="_SUCCESS")
fs.rename(pt,dist)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ object ProfilerLoader {
(getSampleID(name), name) }
).toMap[Long, String]

val schemaFile = datasetpath+"/test.schema"
val schemaFile = datasetpath+"/schema.xml"
val parser = (new CustomParser).setSchema(schemaFile)

// Parser functions
Expand Down

0 comments on commit 6f9e986

Please sign in to comment.