// me.vertretungsplan.parser.DaVinciParser Maven / Gradle / Ivy
/*
* substitution-schedule-parser - Java library for parsing schools' substitution schedules
* Copyright (c) 2016 Johan v. Forstner
*
* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
* If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
package me.vertretungsplan.parser;
import me.vertretungsplan.exception.CredentialInvalidException;
import me.vertretungsplan.objects.Substitution;
import me.vertretungsplan.objects.SubstitutionSchedule;
import me.vertretungsplan.objects.SubstitutionScheduleData;
import me.vertretungsplan.objects.SubstitutionScheduleDay;
import org.apache.http.client.HttpResponseException;
import org.jetbrains.annotations.NotNull;
import org.joda.time.LocalDate;
import org.joda.time.LocalDateTime;
import org.joda.time.format.DateTimeFormat;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parser for substitution schedules in HTML format created by the DaVinci
 * software. Supports DaVinci 6 and 5 (the latter was not tested in depth yet).
 * <p>
 * This parser can be accessed using <code>"davinci"</code>
 * for {@link SubstitutionScheduleData#setApi(String)}.
 *
 * <h4>Configuration parameters</h4>
 * These parameters can be supplied in {@link SubstitutionScheduleData#setData(JSONObject)} to configure the parser:
 *
 * <dl>
 * <dt><code>url</code> (String, required if <code>urls</code> is not specified)</dt>
 * <dd>The URL where the home page of the DaVinci HTML export can be found. This can either be a schedule for a
 * single day or an overview page with a selection of classes or days (in both calendar and list views)</dd>
 *
 * <dt><code>urls</code> (Array of Strings, required if <code>url</code> is not specified)</dt>
 * <dd>The URLs where the home pages of the DaVinci HTML exports can be found. Each of them can either be a
 * schedule for a single day or an overview page with a selection of classes or days (in both calendar and list
 * views)</dd>
 *
 * <dt><code>classes</code> (Array of Strings, required if <code>classesSource</code> is not specified)</dt>
 * <dd>The list of all classes, as they can appear in the schedule</dd>
 *
 * <dt><code>classesSource</code> (String, optional)</dt>
 * <dd>The URL of the homepage of a DaVinci timetable, showing the list of all available classes</dd>
 *
 * <dt><code>website</code> (String, recommended)</dt>
 * <dd>The URL of a website where the substitution schedule can be seen online</dd>
 *
 * <dt><code>embeddedContentSelector</code> (String, optional)</dt>
 * <dd>When the DaVinci schedule is embedded in another HTML file using server-side code, you can use this to
 * specify which HTML elements should be considered as the containers for the DaVinci schedule. The CSS selector
 * syntax is supported as specified by
 * <a href="https://jsoup.org/cookbook/extracting-data/selector-syntax">JSoup</a>.</dd>
 * </dl>
 *
 * Additionally, this parser supports the parameters specified in {@link LoginHandler} for login-protected schedules.
 */
public class DaVinciParser extends BaseParser {
// Character encoding handed to httpGet() for every page this parser downloads.
private static final String ENCODING = "UTF-8";
// Keys looked up in the JSON configuration object returned by scheduleData.getData().
// "url": a single start page; read when "urls" is absent.
private static final String PARAM_URL = "url";
// "urls": JSON array of start pages; takes precedence over "url" when present.
private static final String PARAM_URLS = "urls";
// "classesSource": page from which getAllClasses() scrapes the class names.
private static final String PARAM_CLASSES_SOURCE = "classesSource";
// "embeddedContentSelector": CSS selector for the elements that contain an embedded schedule.
private static final String PARAM_EMBEDDED_CONTENT_SELECTOR = "embeddedContentSelector";
// "website": value reported through SubstitutionSchedule#setWebsite; the first URL is used if absent.
private static final String PARAM_WEBSITE = "website";
/**
 * Creates a parser; both arguments are passed straight through to the superclass constructor.
 *
 * @param scheduleData   forwarded unchanged to the superclass
 * @param cookieProvider forwarded unchanged to the superclass
 */
public DaVinciParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) {
super(scheduleData, cookieProvider);
}
/**
 * Parses a DaVinci table without a fixed class and without a preselected day.
 * <p>
 * Convenience overload: delegates to the five-argument variant with {@code null} for both the class
 * and the day.
 *
 * @param table         the table element to read
 * @param v             the schedule that receives the parsed days and substitutions
 * @param colorProvider used to look up a color for each substitution type
 * @throws IOException declared by the five-argument variant this method delegates to
 */
static void parseDaVinciTable(Element table, SubstitutionSchedule v, ColorProvider colorProvider)
throws IOException {
parseDaVinciTable(table, v, null, null, colorProvider);
}
/**
 * Parses a DaVinci table into the given day without forcing a fixed class.
 * <p>
 * Convenience overload: delegates to the five-argument variant with {@code null} for the class.
 *
 * @param table         the table element to read
 * @param v             the schedule that receives the parsed days and substitutions
 * @param day           the day handed on to the five-argument variant; may be {@code null}
 * @param colorProvider used to look up a color for each substitution type
 * @throws IOException declared by the five-argument variant this method delegates to
 */
static void parseDaVinciTable(Element table, SubstitutionSchedule v, SubstitutionScheduleDay day, ColorProvider
colorProvider) throws IOException {
parseDaVinciTable(table, v, null, day, colorProvider);
}
/** Matches cells of the form {@code +current (previous)}: group 1 is the current value, group 2 the previous one. */
private static final Pattern DAVINCI_PREVIOUS_CURRENT_PATTERN = Pattern.compile("\\+([^\\s]+) \\(([^)]+)\\)");
/** Matches a parenthesised value: group 1 is the text between the parentheses. */
private static final Pattern DAVINCI_PREVIOUS_PATTERN = Pattern.compile("\\(([^)]+)\\)");

/**
 * Reads all data rows of a DaVinci table and adds one {@link Substitution} per row to the schedule.
 * <p>
 * Column titles are taken from {@code thead tr th}; if there are none, the cells of the first row are
 * used as titles and that row is not parsed as data. Each cell is then interpreted according to the
 * title of its column. An empty cell in a class, lesson or date column takes over the last non-empty
 * value seen in that column; an empty type column yields the type {@code "Vertretung"}.
 * <p>
 * Rows that have fewer cells than there are column titles are skipped. Rows for which neither the
 * table nor the {@code day} argument supplies a date are skipped as well.
 *
 * @param table         the table element to read
 * @param v             the schedule in which days are looked up and, if necessary, created
 * @param klasse        if not {@code null}, every substitution is assigned exactly this class,
 *                      overriding whatever the table says
 * @param day           the day to add substitutions to when a row has no date of its own or the same
 *                      date; may be {@code null}
 * @param colorProvider used to look up a color for each substitution type
 * @throws IOException declared for callers; see {@code ParserUtils.parseDate} and {@code recognizeType}
 */
static void parseDaVinciTable(Element table, SubstitutionSchedule v, String klasse, SubstitutionScheduleDay day,
                              ColorProvider colorProvider) throws IOException {
    boolean skipRow = false;
    List<String> headers = new ArrayList<>();
    for (Element header : table.select("thead tr th")) {
        headers.add(header.text());
    }
    if (headers.isEmpty()) {
        // No <thead>: the first row carries the titles and must not be treated as data.
        skipRow = true;
        for (Element header : table.select(" tr:first-child td")) {
            headers.add(header.text());
        }
    }

    // These three variables carry the last non-empty class, lesson and date down the table,
    // so that rows with an empty cell in one of those columns can reuse the previous value.
    Set<String> classes = new HashSet<>();
    String lesson = null;
    LocalDate currentDate = null;

    for (Element row : table.select("tr:not(thead tr)")) {
        if (skipRow) {
            skipRow = false;
            continue;
        }

        Elements columns = row.select("td");
        if (columns.size() < headers.size()) {
            // The cells cannot be mapped to the column titles; indexing them would throw.
            continue;
        }

        Substitution subst = new Substitution();
        LocalDate substDate = null;

        for (int i = 0; i < headers.size(); i++) {
            String value = columns.get(i).text().replace("\u00a0", "");
            String header = headers.get(i);

            if (value.isEmpty()) {
                if (header.equals("Klasse")) {
                    // Copy, so that this substitution does not share the carried-over set.
                    subst.setClasses(new HashSet<>(classes));
                }
                if (isLessonHeader(header)) {
                    subst.setLesson(lesson);
                }
                if (header.equals("Art") || header.equals("Merkmal")) {
                    subst.setType("Vertretung");
                }
                if (header.equals("Datum")) {
                    substDate = currentDate;
                }
                continue;
            }

            Matcher previousCurrentMatcher = DAVINCI_PREVIOUS_CURRENT_PATTERN.matcher(value);
            Matcher previousMatcher = DAVINCI_PREVIOUS_PATTERN.matcher(value);

            switch (header) {
                case "Klasse":
                    String classesStr = value;
                    if (previousMatcher.find()) {
                        classesStr = previousMatcher.group(1);
                    }
                    classes = new HashSet<>(Arrays.asList(classesStr.split(", ")));
                    subst.setClasses(classes);
                    break;
                case "Pos":
                case "Stunde":
                case "Std.":
                case "Dstd.":
                case "UE":
                    lesson = value;
                    subst.setLesson(lesson);
                    break;
                case "VLehrer":
                case "VLehrer Kürzel":
                case "VLehrer Name":
                case "Vertreter":
                case "Vertretungslehrkraft":
                    // A leading '*' marks a type label instead of a teacher.
                    if (!value.startsWith("*")) {
                        subst.setTeacher(value);
                    } else {
                        subst.setType(value.substring(1));
                    }
                    break;
                case "Lehrer":
                case "Lehrer Kürzel":
                case "Lehrer Name":
                case "Lehrkraft":
                    if (previousCurrentMatcher.find()) {
                        subst.setTeacher(previousCurrentMatcher.group(1));
                        subst.setPreviousTeacher(previousCurrentMatcher.group(2));
                    } else if (previousMatcher.find()) {
                        subst.setPreviousTeacher(previousMatcher.group(1));
                    } else {
                        subst.setPreviousTeacher(value);
                    }
                    break;
                case "VFach":
                case "V Fach":
                    subst.setSubject(value);
                    break;
                case "Fach":
                case "Original Fach":
                    if (previousCurrentMatcher.find()) {
                        subst.setSubject(previousCurrentMatcher.group(1));
                        subst.setPreviousSubject(previousCurrentMatcher.group(2));
                    } else {
                        subst.setPreviousSubject(value);
                    }
                    break;
                case "VRaum":
                case "V Raum":
                case "Vertretungs Raum":
                    subst.setRoom(value);
                    break;
                case "Raum":
                case "Original Raum":
                    if (previousCurrentMatcher.find()) {
                        subst.setRoom(previousCurrentMatcher.group(1));
                        subst.setPreviousRoom(previousCurrentMatcher.group(2));
                    } else {
                        subst.setPreviousRoom(value);
                    }
                    break;
                case "Art":
                case "Merkmal":
                case "Information":
                    subst.setType(value);
                    break;
                case "Info":
                case "Mitteilung":
                case "Bemerkung":
                    subst.setDesc(value);
                    break;
                case "Datum":
                    substDate = ParserUtils.parseDate(value);
                    currentDate = substDate;
                    break;
            }
        }

        if (klasse != null) {
            Set<String> fixedClasses = new HashSet<>();
            fixedClasses.add(klasse);
            subst.setClasses(fixedClasses);
        }

        if (subst.getType() == null) {
            String recognizedType = null;
            if (subst.getDesc() != null) {
                recognizedType = recognizeType(subst.getDesc());
            }
            subst.setType(recognizedType != null ? recognizedType : "Vertretung");
        }
        subst.setColor(colorProvider.getColor(subst.getType()));

        if (substDate == null && day == null) {
            // No date from the table and none from the caller: the row cannot be placed.
            continue;
        }
        if (day == null || substDate != null && !substDate.equals(day.getDate())) {
            // substDate is non-null here: either day was null (and the check above passed)
            // or the second operand required it.
            day = findOrCreateDay(v, substDate);
        }
        day.addSubstitution(subst);
    }
}

/** Tells whether a column title denotes the lesson column. */
private static boolean isLessonHeader(String header) {
    return header.equals("Pos") || header.equals("Stunde") || header.equals("Std.")
            || header.equals("Dstd.") || header.equals("UE");
}

/** Returns the day of the schedule with the given non-null date, creating and adding it if none exists. */
private static SubstitutionScheduleDay findOrCreateDay(SubstitutionSchedule v, LocalDate date) {
    SubstitutionScheduleDay match = null;
    for (SubstitutionScheduleDay candidate : v.getDays()) {
        // No early exit: if several days share the date, the last one is used.
        if (date.equals(candidate.getDate())) {
            match = candidate;
        }
    }
    if (match == null) {
        match = new SubstitutionScheduleDay();
        match.setDate(date);
        v.addDay(match);
    }
    return match;
}
/**
 * Downloads and parses all configured DaVinci pages into one {@link SubstitutionSchedule}.
 * <p>
 * The start pages come from the {@code urls} array if present, otherwise from {@code url}. If
 * {@code embeddedContentSelector} is configured, every element it selects on a start page is parsed
 * as a schedule; otherwise the start page is expanded with {@link #getDayUrls(String, Document)} and
 * each resulting page is parsed.
 * <p>
 * An {@link IOException} for one start page does not abort the others. Only when no start page could
 * be processed is the last such exception rethrown.
 *
 * @return the schedule, including website, classes and teachers
 * @throws IOException                if every start page failed with an I/O error
 * @throws JSONException              if a required configuration value is missing or has the wrong type
 * @throws CredentialInvalidException declared for the login and download helpers called here
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    List<String> urls = new ArrayList<>();
    if (scheduleData.getData().has(PARAM_URLS)) {
        JSONArray urlsArray = scheduleData.getData().getJSONArray(PARAM_URLS);
        for (int i = 0; i < urlsArray.length(); i++) {
            urls.add(urlsArray.getString(i));
        }
    } else {
        urls.add(scheduleData.getData().getString(PARAM_URL));
    }

    int successfulSchedules = 0;
    IOException lastException = null;
    for (String url : urls) {
        try {
            Document doc = Jsoup.parse(httpGet(url, ENCODING));
            if (scheduleData.getData().has(PARAM_EMBEDDED_CONTENT_SELECTOR)) {
                String selector = scheduleData.getData().getString(PARAM_EMBEDDED_CONTENT_SELECTOR);
                for (Element el : doc.select(selector)) {
                    parsePage(el, schedule, colorProvider);
                }
            } else {
                // Links are only resolved here, where they are needed, so that an unrelated
                // malformed link cannot fail a page in embedded mode.
                for (String dayUrl : getDayUrls(url, doc)) {
                    Document dayDoc;
                    if (dayUrl.equals(url)) {
                        // Single-day page: already downloaded.
                        dayDoc = doc;
                    } else {
                        dayDoc = Jsoup.parse(httpGet(dayUrl, ENCODING));
                    }
                    parsePage(dayDoc, schedule, colorProvider);
                }
            }
            successfulSchedules++;
        } catch (IOException e) {
            // Best effort: remember the failure and continue with the next start page.
            lastException = e;
        }
    }
    if (successfulSchedules == 0 && lastException != null) {
        throw lastException;
    }

    if (scheduleData.getData().has(PARAM_WEBSITE)) {
        schedule.setWebsite(scheduleData.getData().getString(PARAM_WEBSITE));
    } else if (!urls.isEmpty()) {
        // An empty "urls" array leaves nothing to fall back to; the website then stays unset.
        schedule.setWebsite(urls.get(0));
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}
/**
 * Determines which pages have to be parsed for a given start page.
 * <p>
 * The start page is inspected in this order, and the first structure found decides the result:
 * <ol>
 * <li>{@code ul.classes}: the links of the class list</li>
 * <li>{@code ul.month}: the targets of the calendar view's {@code onclick} handlers</li>
 * <li>{@code ul.day-index}: the links of the day list</li>
 * <li>{@code table td[align=left] a}: the links of the class table (DaVinci 5)</li>
 * <li>otherwise the start page itself is the only page</li>
 * </ol>
 * Relative links are resolved against {@code url}.
 *
 * @param url the URL the document was loaded from
 * @param doc the parsed start page
 * @return the absolute URLs of the pages to parse; never {@code null}
 * @throws IOException if {@code url} or one of the links cannot be turned into a URL
 */
@NotNull
static List<String> getDayUrls(String url, Document doc)
        throws IOException {
    List<String> dayUrls = new ArrayList<>();
    if (!doc.select("ul.classes").isEmpty()) {
        // List of classes
        addResolvedHrefs(dayUrls, url, doc.select("ul.classes li a"));
    } else if (!doc.select("ul.month").isEmpty()) {
        // List of days in calendar view
        URL base = null;
        for (Element day : doc.select("ul.month li input[onclick]")) {
            String urlFromOnclick = urlFromOnclick(day.attr("onclick"));
            if (urlFromOnclick == null) {
                continue;
            }
            if (base == null) {
                // Parsed lazily, so the base URL is only touched when there is something to resolve.
                base = new URL(url);
            }
            dayUrls.add(new URL(base, urlFromOnclick).toString());
        }
    } else if (!doc.select("ul.day-index").isEmpty()) {
        // List of days in list view
        addResolvedHrefs(dayUrls, url, doc.select("ul.day-index li a"));
    } else {
        Elements classTableLinks = doc.select("table td[align=left] a");
        if (!classTableLinks.isEmpty()) {
            // Table of classes (DaVinci 5)
            addResolvedHrefs(dayUrls, url, classTableLinks);
        } else {
            // Single day
            dayUrls.add(url);
        }
    }
    return dayUrls;
}

/** Resolves the {@code href} of every link against the base URL and appends the results to the target list. */
private static void addResolvedHrefs(List<String> target, String baseUrl, Elements links) throws IOException {
    if (links.isEmpty()) {
        return;
    }
    URL base = new URL(baseUrl);
    for (Element link : links) {
        target.add(new URL(base, link.attr("href")).toString());
    }
}
/** Matches {@code window.location.href='...'}; group 1 is the text between the single quotes. */
private static final Pattern ONCLICK_HREF_PATTERN = Pattern.compile("window\\.location\\.href='([^']+)'");

/**
 * Extracts the navigation target from an {@code onclick} handler.
 *
 * @param onclick the value of the {@code onclick} attribute
 * @return the single-quoted value assigned to {@code window.location.href}, or {@code null} if the
 * handler is {@code null} or contains no such assignment
 */
private static String urlFromOnclick(String onclick) {
    if (onclick == null) {
        return null;
    }
    Matcher matcher = ONCLICK_HREF_PATTERN.matcher(onclick);
    return matcher.find() ? matcher.group(1) : null;
}
/** Finds a date such as {@code 12.9.2016} in a page title. */
private static final Pattern TITLE_DATE_PATTERN = Pattern.compile("\\d+\\.\\d+.\\d{4}");
/** Last-change timestamp in the form {@code dd-MM-yyyy HH:mm |}; group 1 is the timestamp. */
private static final Pattern LAST_CHANGE_DASHED_PATTERN =
        Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|");
/** Last-change timestamp in the form {@code dd.MM.yyyy | HH:mm}; group 1 is the timestamp. */
private static final Pattern LAST_CHANGE_DOTTED_PATTERN =
        Pattern.compile("(\\d{2}\\.\\d{2}\\.\\d{4} \\| \\d+:\\d{2})");

/**
 * Parses one DaVinci page (or one embedded schedule container) into the schedule.
 * <p>
 * The title is taken from {@code h1.list-table-caption} or, failing that, from the first {@code h2}
 * (DaVinci 5). If the title contains a date, the page describes that day. Otherwise the title is
 * taken as the class name and the element following the title is checked for a date; if it has none,
 * the page is treated as spanning several days and the dates are read from the table rows.
 * <p>
 * Messages and the last-change time are attached to the day if the page has one; for a page without
 * its own day the last-change time is set on the schedule and messages are dropped.
 *
 * @param doc           the document or container element to parse
 * @param schedule      the schedule that receives the parsed data
 * @param colorProvider used to look up a color for each substitution type
 * @throws IOException if the page has no title element, or as declared by the table parser
 */
static void parsePage(Element doc, SubstitutionSchedule schedule, ColorProvider colorProvider) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    Element titleElem = doc.select("h1.list-table-caption").first();
    if (titleElem == null) {
        // DaVinci 5
        titleElem = doc.select("h2").first();
    }
    if (titleElem == null) {
        throw new IOException("Could not find the title of the DaVinci schedule");
    }
    String title = titleElem.text();

    String klasse = null;
    // title can either be date or class
    // NOTE(review): the second '.' in the two date expressions below is unescaped and matches
    // any character; left as is because ParserUtils.parseDate decides what it accepts.
    Matcher dateMatcher = TITLE_DATE_PATTERN.matcher(title);
    if (dateMatcher.find()) {
        day.setDateString(dateMatcher.group());
        day.setDate(ParserUtils.parseDate(dateMatcher.group()));
    } else {
        klasse = title;
        Element next = titleElem.nextElementSibling();
        String nextText = next != null ? next.text() : "";
        if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) {
            day.setDateString(nextText);
            day.setDate(ParserUtils.parseDate(nextText));
        } else {
            // could not find date, must be multiple days
            day = null;
        }
    }

    if (day != null) {
        // Without a day of its own the page has nothing to attach messages to.
        addTextNodesAsMessages(day, doc.select(".row:has(h1.list-table-caption) p"));
        addTextNodesAsMessages(day, doc.select(".callout"));
    }

    LocalDateTime lastChange = findLastChange(doc);
    if (lastChange != null) {
        if (day != null) {
            day.setLastChange(lastChange);
        } else {
            schedule.setLastChange(lastChange);
        }
    }

    if (doc.select(".list-table").size() > 0 || !doc.select(".callout").text().contains("Es liegen keine")) {
        Element table = doc.select(".list-table, table").first();
        if (table != null) {
            parseDaVinciTable(table, schedule, klasse, day, colorProvider);
        }
    }

    if (day != null) {
        schedule.addDay(day);
    }
}

/** Adds the trimmed, non-empty direct text of each element as a message of the day. */
private static void addTextNodesAsMessages(SubstitutionScheduleDay day, Elements elements) {
    for (Element element : elements) {
        for (TextNode node : element.textNodes()) {
            String text = node.text().trim();
            if (!text.isEmpty()) {
                day.addMessage(text);
            }
        }
    }
}

/** Extracts the last-change timestamp of a page, or returns {@code null} if none is found. */
private static LocalDateTime findLastChange(Element doc) {
    Element lastChangeElem = doc.select(".row.copyright div").first();
    if (lastChangeElem == null) {
        // DaVinci 5
        lastChangeElem = doc.select("h1").first();
    }

    if (lastChangeElem != null) {
        String lastChange = lastChangeElem.ownText();
        Matcher dashed = LAST_CHANGE_DASHED_PATTERN.matcher(lastChange);
        if (dashed.find()) {
            return DateTimeFormat.forPattern("dd-MM-yyyy HH:mm").parseLocalDateTime(dashed.group(1));
        }
        Matcher dotted = LAST_CHANGE_DOTTED_PATTERN.matcher(lastChange);
        if (dotted.find()) {
            return DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm").parseLocalDateTime(dotted.group(1));
        }
        return null;
    }

    // No element carries the timestamp: search the raw HTML for one in the format parsed below.
    // NOTE(review): the expression used here before was empty and had no capture group, so
    // group(1) always threw; confirm that matching the bare timestamp is the intended search.
    Matcher inHtml = LAST_CHANGE_DOTTED_PATTERN.matcher(doc.html());
    if (inHtml.find()) {
        return DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm").parseLocalDateTime(inHtml.group(1));
    }
    return null;
}
/**
 * Returns the names of all classes.
 * <p>
 * If {@code classesSource} is configured, that page is downloaded and the text of every
 * {@code li.Class} element is returned; if the page has none, the links matching
 * {@code td[align=left] a} are used instead (DaVinci 5). Without {@code classesSource} the result of
 * {@code getClassesFromJson()} is returned.
 *
 * @return the class names
 * @throws IOException                if the page cannot be downloaded
 * @throws JSONException              if the configuration cannot be read
 * @throws CredentialInvalidException declared for the download helper called here
 */
@Override
public List<String> getAllClasses() throws IOException, JSONException, CredentialInvalidException {
    if (!scheduleData.getData().has(PARAM_CLASSES_SOURCE)) {
        return getClassesFromJson();
    }

    Document doc = Jsoup.parse(httpGet(scheduleData.getData().getString(PARAM_CLASSES_SOURCE), ENCODING));
    Elements elems = doc.select("li.Class");
    if (elems.isEmpty()) {
        // daVinci 5
        elems = doc.select("td[align=left] a");
    }

    List<String> classes = new ArrayList<>(elems.size());
    for (Element elem : elems) {
        classes.add(elem.text());
    }
    return classes;
}
/**
 * Always returns {@code null}; nothing is downloaded or parsed here.
 *
 * @return {@code null}
 */
@Override
public List getAllTeachers() {
return null;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy