
// tri.timeseries.TimeSeriesProcessor.kt Maven / Gradle / Ivy
/*-
* #%L
* coda-data
* --
* Copyright (C) 2020 - 2021 Elisha Peterson
* --
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package tri.timeseries
import tri.area.AreaInfo
import tri.covid19.data.IhmeForecasts
import tri.util.ansiYellow
import java.io.File
import java.io.FileOutputStream
import java.net.URL
import java.nio.charset.Charset
import kotlin.time.ExperimentalTime
import kotlin.time.measureTimedValue
/**
 * Tool that reads and processes input data into a normalized format, and stores that data locally so that
 * it can be retrieved more quickly the next time it is needed.
 *
 * NOTE(review): the bare `List`/`Set` types in this class are kept exactly as they appear in this copy of the
 * file; their generic type arguments look like they were lost -- confirm against the original source.
 */
abstract class TimeSeriesProcessor {

    /**
     * Loads data, optionally restricted to a single source.
     *
     * Locally processed data is used when [loadProcessedData] returns a result; otherwise the raw data is
     * reloaded via [reloadRawData]. The choice between the two is made before the source filter is applied.
     *
     * @param source source to keep, or `null` to keep everything
     */
    fun data(source: String? = null) = (loadProcessedData() ?: reloadRawData()).bySource(source)

    //region DATA LOADING

    /**
     * Forces data to be reprocessed from the original source, then saves the result via [saveProcessed].
     *
     * @return the series produced by [loadRaw]
     * @throws IllegalStateException if [loadRaw] returns nothing
     */
    fun reloadRawData(): List {
        val series = loadRaw()
        check(series.isNotEmpty()) { "Could not find data" }
        processingNote("Loaded raw data. Now saving ${series.size} time series using ${this::class.simpleName}")
        saveProcessed(series)
        return series
    }

    /**
     * Loads already processed data, if present.
     *
     * @return the result of [loadProcessed], or `null` when that result is empty
     */
    fun loadProcessedData(): List? = loadProcessed().takeIf { it.isNotEmpty() }

    //endregion

    /** Set of metric/qualifier pairs provided by this processor. */
    abstract fun metricsProvided(): Set

    /**
     * Filter indicating whether the given data is provided by this processor.
     *
     * This default implementation only checks the metric/qualifier pair against [metricsProvided] and does not
     * look at [area]. Override to limit areas.
     */
    open fun provides(area: AreaInfo, metric: String, qualifier: String): Boolean {
        val requested = MetricInfo(metric, qualifier)
        return requested in metricsProvided()
    }

    /** Load data from original source. */
    abstract fun loadRaw(): List

    /** Saves processed data, so it can be retrieved more quickly later. */
    abstract fun saveProcessed(data: List)

    /** Load data from local source/cache, if possible. */
    abstract fun loadProcessed(): List

    /** Keeps only the series whose `source` equals [source]; a `null` [source] returns the receiver unchanged. */
    private fun List.bySource(source: String? = null) = when (source) {
        null -> this
        else -> filter { series -> series.source == source }
    }
}
/**
 * Processor that keeps its processed time series in a local file, so that data read from a "raw" source can be
 * saved to, and later reloaded from, a processed file location.
 *
 * NOTE(review): the bare `List` types in this class are kept exactly as they appear in this copy of the file;
 * their generic type arguments look like they were lost -- confirm against the original source.
 *
 * @property processed supplier of the processed (cache) file; invoked each time the file is needed
 */
@ExperimentalTime
abstract class TimeSeriesCachingProcessor(val processed: () -> File): TimeSeriesProcessor() {

    /**
     * Reads the processed file, if it exists, and logs how long the read took.
     *
     * The file is always read as UTF-8; a note is logged when the platform default charset is something else.
     *
     * @return the series read from the processed file, or an empty list when the file does not exist
     */
    override fun loadProcessed(): List {
        val file = processed()
        if (!file.exists()) {
            processingNote("Processed file not found -- will reload raw data: $file")
            return listOf()
        }

        val timed = measureTimedValue {
            if (Charset.defaultCharset() != Charsets.UTF_8) {
                processingNote("Default charset is ${Charset.defaultCharset()}; loading files with UTF-8 instead.")
            }
            TimeSeriesFileFormat.readSeries(file, Charsets.UTF_8)
        }
        processingNote("Loaded ${timed.value.size} processed time series in ${timed.duration} from $file")
        return timed.value
    }

    /** Deletes the processed file, returning the result of [File.delete]. */
    fun deleteProcessedFile() = processed().delete()

    /**
     * Writes [data] to the processed file as UTF-8.
     *
     * The output stream is wrapped in `use` so that it is closed even when writing fails; previously the stream
     * was opened here and never closed by this class.
     */
    override fun saveProcessed(data: List) =
        FileOutputStream(processed()).use { out -> TimeSeriesFileFormat.writeSeries(data, out, Charsets.UTF_8) }

    /**
     * Post-processes series loaded from the raw source before they are saved.
     *
     * The default delegates to `regroupAndMax(coerceIncreasing = false)`. Override to customize.
     */
    open fun process(series: List) = series.regroupAndMax(coerceIncreasing = false)
}
/**
 * Processes raw files to processed files, and reads the processed files instead when possible.
 *
 * NOTE(review): the bare `List` types are kept exactly as they appear in this copy of the file; their generic
 * type arguments look like they were lost -- confirm against the original source.
 *
 * @property rawSources supplier of the raw input files
 * @param processed supplier of the processed (cache) file, passed on to [TimeSeriesCachingProcessor]
 */
@ExperimentalTime
abstract class TimeSeriesFileProcessor(val rawSources: () -> List, processed: () -> File): TimeSeriesCachingProcessor(processed) {

    /**
     * Runs [inprocess] on every raw file, logging the row count and elapsed time per file, and passes the
     * combined rows through [process].
     */
    override fun loadRaw() = process(rawSources().flatMap { file ->
        val timed = measureTimedValue {
            processingNote("Loading data from $file...")
            inprocess(file)
        }
        processingNote("Loaded ${timed.value.size} rows in ${timed.duration} from $file")
        timed.value
    })

    /** Reads a single raw [file] and returns the rows it contains. */
    abstract fun inprocess(file: File): List
}
/**
 * Processes URLs to processed files, and reads the processed files instead when possible.
 *
 * NOTE(review): the bare `List` types are kept exactly as they appear in this copy of the file; their generic
 * type arguments look like they were lost -- confirm against the original source.
 *
 * @property rawSources supplier of the raw input URLs
 * @param processed supplier of the processed (cache) file, passed on to [TimeSeriesCachingProcessor]
 */
@ExperimentalTime
abstract class TimeSeriesUrlProcessor(val rawSources: () -> List, processed: () -> File): TimeSeriesCachingProcessor(processed) {

    /**
     * Runs [inprocess] on every raw URL, logging the row count and elapsed time per URL, and passes the
     * combined rows through [process].
     */
    override fun loadRaw() = process(rawSources().flatMap { url ->
        val timed = measureTimedValue {
            processingNote("Loading data from $url...")
            inprocess(url)
        }
        processingNote("Loaded ${timed.value.size} rows in ${timed.duration} from $url")
        timed.value
    })

    /** Reads a single raw [url] and returns the rows it contains. */
    abstract fun inprocess(url: URL): List
}
/** Prints [text] to standard output, prefixed with a bracketed "DATA" tag that is passed through [ansiYellow]. */
private fun processingNote(text: String) {
    println("[${ansiYellow("DATA")}] $text")
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy