All Downloads are FREE. Search and download functionalities are using the official Maven repository.

me.vertretungsplan.parser.DaVinciParser Maven / Gradle / Ivy

/*
 * substitution-schedule-parser - Java library for parsing schools' substitution schedules
 * Copyright (c) 2016 Johan v. Forstner
 *
 * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
 * If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

package me.vertretungsplan.parser;

import me.vertretungsplan.exception.CredentialInvalidException;
import me.vertretungsplan.objects.Substitution;
import me.vertretungsplan.objects.SubstitutionSchedule;
import me.vertretungsplan.objects.SubstitutionScheduleData;
import me.vertretungsplan.objects.SubstitutionScheduleDay;
import org.apache.http.client.HttpResponseException;
import org.jetbrains.annotations.NotNull;
import org.joda.time.LocalDate;
import org.joda.time.LocalDateTime;
import org.joda.time.format.DateTimeFormat;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser for substitution schedules in HTML format created by the DaVinci
 * software. Supports DaVinci 6 and 5 (the latter was not tested in depth yet).
 * 

* This parser can be accessed using "davinci" for {@link SubstitutionScheduleData#setApi(String)}. * *

Configuration parameters

* These parameters can be supplied in {@link SubstitutionScheduleData#setData(JSONObject)} to configure the parser: * *
*
url (String, required if urls not specified)
*
The URL of the home page of the DaVinci HTML export can be found. This can either be a schedule for a single * day or an overview page with a selection of classes or days (in both calendar and list views)
* *
urls (Array of strings, required if url not specified)
*
The URLs of the home page of the DaVinci HTML export can be found. This can either be a schedule for a single * day or an overview page with a selection of classes or days (in both calendar and list views)
* *
classes (Array of Strings, required if classesSource not specified)
*
The list of all classes, as they can appear in the schedule
* *
classesSource (String, optional)
*
The URL of the homepage of a DaVinci timetable, showing the list of all available classes
* *
website (String, recommended)
*
The URL of a website where the substitution schedule can be seen online
*
* *
embeddedContentSelector (String, optional)
*
When the DaVinci schedule is embedded in another HTML file using server-side code, you can use this to * specify which HTML elements should be considered as the containers for the DaVinci schedule. The CSS selector * syntax is supported as specified by * JSoup.
* * * Additionally, this parser supports the parameters specified in {@link LoginHandler} for login-protected schedules. */ public class DaVinciParser extends BaseParser { private static final String ENCODING = "UTF-8"; private static final String PARAM_URL = "url"; private static final String PARAM_URLS = "urls"; private static final String PARAM_CLASSES_SOURCE = "classesSource"; private static final String PARAM_EMBEDDED_CONTENT_SELECTOR = "embeddedContentSelector"; private static final String PARAM_WEBSITE = "website"; public DaVinciParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) { super(scheduleData, cookieProvider); } static void parseDaVinciTable(Element table, SubstitutionSchedule v, ColorProvider colorProvider) throws IOException { parseDaVinciTable(table, v, null, null, colorProvider); } static void parseDaVinciTable(Element table, SubstitutionSchedule v, SubstitutionScheduleDay day, ColorProvider colorProvider) throws IOException { parseDaVinciTable(table, v, null, day, colorProvider); } static void parseDaVinciTable(Element table, SubstitutionSchedule v, String klasse, SubstitutionScheduleDay day, ColorProvider colorProvider) throws IOException { boolean skipRow = false; List headers = new ArrayList<>(); for (Element header : table.select("thead tr th")) { headers.add(header.text()); } if (headers.size() == 0) { skipRow = true; for (Element header : table.select(" tr:first-child td")) { headers.add(header.text()); } } // These three variables can Set classes = new HashSet<>(); String lesson = null; LocalDate currentDate = null; Pattern previousCurrentPattern = Pattern.compile("\\+([^\\s]+) \\(([^)]+)\\)"); Pattern previousPattern = Pattern.compile("\\(([^)]+)\\)"); for (Element row : table.select("tr:not(thead tr)")) { if (skipRow) { skipRow = false; continue; } Substitution subst = new Substitution(); LocalDate substDate = null; Elements columns = row.select("td"); for (int i = 0; i < headers.size(); i++) { String value = columns.get(i).text().replace("\u00a0", ""); String header = headers.get(i); if (value.isEmpty()) { if (header.equals("Klasse")) subst.setClasses(new HashSet<>(classes)); if (header.equals("Pos") || header.equals("Stunde") || header.equals("Std.") || header.equals ("Dstd.") || header.equals("UE")) { subst.setLesson(lesson); } if (header.equals("Art") || header.equals("Merkmal")) subst.setType("Vertretung"); if (header.equals("Datum")) substDate = currentDate; continue; } Matcher previousCurrentMatcher = previousCurrentPattern.matcher(value); Matcher previousMatcher = previousPattern.matcher(value); switch (header) { case "Klasse": String classesStr = value; if (previousMatcher.find()) { classesStr = previousMatcher.group(1); } classes = new HashSet<>(Arrays.asList(classesStr.split(", "))); subst.setClasses(classes); break; case "Pos": case "Stunde": case "Std.": case "Dstd.": case "UE": lesson = value; subst.setLesson(lesson); break; case "VLehrer": case "VLehrer Kürzel": case "VLehrer Name": case "Vertreter": case "Vertretungslehrkraft": if (!value.startsWith("*")) { subst.setTeacher(value); } else { subst.setType(value.substring(1)); } break; case "Lehrer": case "Lehrer Kürzel": case "Lehrer Name": case "Lehrkraft": if (previousCurrentMatcher.find()) { subst.setTeacher(previousCurrentMatcher.group(1)); subst.setPreviousTeacher(previousCurrentMatcher.group(2)); } else if (previousMatcher.find()) { subst.setPreviousTeacher(previousMatcher.group(1)); } else { subst.setPreviousTeacher(value); } break; case "VFach": case "V Fach": subst.setSubject(value); break; case "Fach": case "Original Fach": if (previousCurrentMatcher.find()) { subst.setSubject(previousCurrentMatcher.group(1)); subst.setPreviousSubject(previousCurrentMatcher.group(2)); } else { subst.setPreviousSubject(value); } break; case "VRaum": case "V Raum": case "Vertretungs Raum": subst.setRoom(value); break; case "Raum": case "Original Raum": if (previousCurrentMatcher.find()) { subst.setRoom(previousCurrentMatcher.group(1)); subst.setPreviousRoom(previousCurrentMatcher.group(2)); } else { subst.setPreviousRoom(value); } break; case "Art": case "Merkmal": case "Information": subst.setType(value); break; case "Info": case "Mitteilung": case "Bemerkung": subst.setDesc(value); break; case "Datum": substDate = ParserUtils.parseDate(value); currentDate = substDate; break; } } if (klasse != null) { Set fixedClasses = new HashSet<>(); fixedClasses.add(klasse); subst.setClasses(fixedClasses); } if (subst.getType() == null) { String recognizedType = null; if (subst.getDesc() != null) recognizedType = recognizeType(subst.getDesc()); subst.setType(recognizedType != null ? recognizedType : "Vertretung"); } subst.setColor(colorProvider.getColor(subst.getType())); if (substDate == null && day == null) continue; if (day == null || substDate != null && !substDate.equals(day.getDate())) { day = null; for (SubstitutionScheduleDay d : v.getDays()) { if (d.getDate().equals(substDate)) { day = d; } } if (day == null) { day = new SubstitutionScheduleDay(); day.setDate(substDate); v.addDay(day); } } day.addSubstitution(subst); } } @Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData); List urls = new ArrayList<>(); if (scheduleData.getData().has(PARAM_URLS)) { JSONArray urlsArray = scheduleData.getData().getJSONArray(PARAM_URLS); for (int i = 0; i < urlsArray.length(); i++) { urls.add(urlsArray.getString(i)); } } else { urls.add(scheduleData.getData().getString(PARAM_URL)); } int successfulSchedules = 0; IOException lastException = null; for (String url:urls) { try { Document doc = Jsoup.parse(httpGet(url, ENCODING)); List dayUrls = getDayUrls(url, doc); if (scheduleData.getData().has(PARAM_EMBEDDED_CONTENT_SELECTOR)) { for (Element el : doc.select(scheduleData.getData().getString(PARAM_EMBEDDED_CONTENT_SELECTOR))) { parsePage(el, schedule, colorProvider); } } else { for (String dayUrl : dayUrls) { Document dayDoc; if (dayUrl.equals(url)) { dayDoc = doc; } else { dayDoc = Jsoup.parse(httpGet(dayUrl, ENCODING)); } parsePage(dayDoc, schedule, colorProvider); } } successfulSchedules ++; } catch (IOException e) { lastException = e; } } if (successfulSchedules == 0 && lastException != null) { throw lastException; } if (scheduleData.getData().has(PARAM_WEBSITE)) { schedule.setWebsite(scheduleData.getData().getString(PARAM_WEBSITE)); } else { schedule.setWebsite(urls.get(0)); } schedule.setClasses(getAllClasses()); schedule.setTeachers(getAllTeachers()); return schedule; } @NotNull static List getDayUrls(String url, Document doc) throws IOException { List dayUrls = new ArrayList<>(); if (doc.select("ul.classes").size() > 0) { // List of classes Elements classes = doc.select("ul.classes li a"); for (Element klasse : classes) { dayUrls.add(new URL(new URL(url), klasse.attr("href")).toString()); } } else if (doc.select("ul.month").size() > 0) { // List of days in calendar view Elements days = doc.select("ul.month li input[onclick]"); for (Element day : days) { String urlFromOnclick = urlFromOnclick(day.attr("onclick")); if (urlFromOnclick == null) continue; dayUrls.add(new URL(new URL(url), urlFromOnclick).toString()); } } else if (doc.select("ul.day-index").size() > 0) { // List of days in list view Elements days = doc.select("ul.day-index li a"); for (Element day : days) { dayUrls.add(new URL(new URL(url), day.attr("href")).toString()); } } else if (doc.select("table td[align=left] a").size() > 0) { // Table of classes (DaVinci 5) Elements classes = doc.select("table td[align=left] a"); for (Element klasse : classes) { dayUrls.add(new URL(new URL(url), klasse.attr("href")).toString()); } } else { // Single day dayUrls.add(url); } return dayUrls; } private static String urlFromOnclick(String onclick) { Pattern pattern = Pattern.compile("window\\.location\\.href='([^']+)'"); Matcher matcher = pattern.matcher(onclick); if (matcher.find()) { return matcher.group(1); } else { return null; } } @NotNull static void parsePage(Element doc, SubstitutionSchedule schedule, ColorProvider colorProvider) throws IOException { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); Element titleElem; if (doc.select("h1.list-table-caption").size() > 0) { titleElem = doc.select("h1.list-table-caption").first(); } else { // DaVinci 5 titleElem = doc.select("h2").first(); } String title = titleElem.text(); String klasse = null; // title can either be date or class Pattern datePattern = Pattern.compile("\\d+\\.\\d+.\\d{4}"); Matcher dateMatcher = datePattern.matcher(title); if (dateMatcher.find()) { day.setDateString(dateMatcher.group()); day.setDate(ParserUtils.parseDate(dateMatcher.group())); } else { klasse = title; String nextText = titleElem.nextElementSibling().text(); if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) { day.setDateString(nextText); day.setDate(ParserUtils.parseDate(nextText)); } else { // could not find date, must be multiple days day = null; } } for (Element p : doc.select(".row:has(h1.list-table-caption) p")) { for (TextNode node : p.textNodes()) { if (!node.text().trim().isEmpty() && day != null) day.addMessage(node.text().trim()); } } for (Element message : doc.select(".callout")) { for (TextNode node : message.textNodes()) { if (!node.text().trim().isEmpty()) day.addMessage(node.text().trim()); } } Element lastChangeElem = doc.select(".row.copyright div").first(); if (lastChangeElem == null) { // DaVinci 5 lastChangeElem = doc.select("h1").first(); } if (lastChangeElem != null) { String lastChange = lastChangeElem.ownText(); Pattern pattern = Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|"); Matcher matcher = pattern.matcher(lastChange); if (matcher.find()) { LocalDateTime lastChangeTime = DateTimeFormat.forPattern("dd-MM-yyyy HH:mm").parseLocalDateTime(matcher.group(1)); if (day != null) { day.setLastChange(lastChangeTime); } else { schedule.setLastChange(lastChangeTime); } } else { Pattern pattern2 = Pattern.compile("(\\d{2}.\\d{2}.\\d{4} \\| \\d+:\\d{2})"); Matcher matcher2 = pattern2.matcher(lastChange); if (matcher2.find()) { LocalDateTime lastChangeTime = DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm").parseLocalDateTime(matcher2.group(1)); if (day != null) { day.setLastChange(lastChangeTime); } else { schedule.setLastChange(lastChangeTime); } } } } else { Pattern pattern = Pattern.compile(""); Matcher matcher = pattern.matcher(doc.html()); if (matcher.find()) { String str = matcher.group(1); LocalDateTime date = DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm").parseLocalDateTime(str); if (day != null) { day.setLastChange(date); } else { schedule.setLastChange(date); } } } if (doc.select(".list-table").size() > 0 || !doc.select(".callout").text().contains("Es liegen keine")) { Element table = doc.select(".list-table, table").first(); parseDaVinciTable(table, schedule, klasse, day, colorProvider); } if (day != null) { schedule.addDay(day); } } @Override public List getAllClasses() throws IOException, JSONException, CredentialInvalidException { if (scheduleData.getData().has(PARAM_CLASSES_SOURCE)) { Document doc = Jsoup.parse(httpGet(scheduleData.getData().getString("classesSource"), ENCODING)); List classes = new ArrayList<>(); Elements elems = doc.select("li.Class"); if (elems.size() == 0) { // daVinci 5 elems = doc.select("td[align=left] a"); } for (Element li : elems) { classes.add(li.text()); } return classes; } else { return getClassesFromJson(); } } @Override public List getAllTeachers() { return null; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy