All Downloads are FREE. Search and download functionalities are using the official Maven repository.

me.vertretungsplan.parser.DaVinciParser Maven / Gradle / Ivy

Go to download

Java library for parsing schools' substitution schedules. Supports multiple different systems mainly used in the German-speaking countries.

There is a newer version: 1.0.0-beta356
Show newest version
/*
 * substitution-schedule-parser - Java library for parsing schools' substitution schedules
 * Copyright (c) 2016 Johan v. Forstner
 *
 * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
 * If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

package me.vertretungsplan.parser;

import me.vertretungsplan.exception.CredentialInvalidException;
import me.vertretungsplan.objects.Substitution;
import me.vertretungsplan.objects.SubstitutionSchedule;
import me.vertretungsplan.objects.SubstitutionScheduleData;
import me.vertretungsplan.objects.SubstitutionScheduleDay;
import org.jetbrains.annotations.NotNull;
import org.joda.time.LocalDateTime;
import org.joda.time.format.DateTimeFormat;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser for substitution schedules in HTML format created by the DaVinci
 * software. Probably only supports the newest version (DaVinci 6).
 * 

* This parser can be accessed using "davinci" for {@link SubstitutionScheduleData#setApi(String)}. * *

Configuration parameters

* These parameters can be supplied in {@link SubstitutionScheduleData#setData(JSONObject)} to configure the parser: * *
*
url (String, required)
*
The URL of the home page of the DaVinci HTML export can be found. This can either be a schedule for a single * day or an overview page with a selection of classes or days (in both calendar and list views)
* *
classes (Array of Strings, required if classesSource not specified)
*
The list of all classes, as they can appear in the schedule
* *
classesSource (String, optional)
*
The URL of the homepage of a DaVinci timetable, showing the list of all available classes
*
* *
embeddedContentSelector (String, optional)
*
When the DaVinci schedule is embedded in another HTML file using server-side code, you can use this to * specify which HTML elements should be considered as the containers for the DaVinci schedule. The CSS selector * syntax is supported as specified by * JSoup.
* * * Additionally, this parser supports the parameters specified in {@link LoginHandler} for login-protected schedules. */ public class DaVinciParser extends BaseParser { private static final String ENCODING = "UTF-8"; private static final String PARAM_URL = "url"; private static final String PARAM_CLASSES_SOURCE = "classesSource"; public static final String PARAM_EMBEDDED_CONTENT_SELECTOR = "embeddedContentSelector"; public DaVinciParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) { super(scheduleData, cookieProvider); } static void parseDaVinciTable(Element table, SubstitutionScheduleDay day, ColorProvider colorProvider) { parseDaVinciTable(table, day, null, colorProvider); } static void parseDaVinciTable(Element table, SubstitutionScheduleDay day, String klasse, ColorProvider colorProvider) { List headers = new ArrayList<>(); for (Element header : table.select("thead tr th, tr td[bgcolor=#9999FF]")) { headers.add(header.text()); } // These two variables can Set classes = new HashSet<>(); String lesson = null; Pattern previousCurrentPattern = Pattern.compile("\\+([^\\s]+) \\(([^)]+)\\)"); Pattern previousPattern = Pattern.compile("\\(([^)]+)\\)"); for (Element row : table.select("tr:not(thead tr, tr:has(td[bgcolor=#9999FF]))")) { Substitution subst = new Substitution(); Elements columns = row.select("td"); for (int i = 0; i < headers.size(); i++) { String value = columns.get(i).text().replace("\u00a0", ""); String header = headers.get(i); if (value.isEmpty()) { if (header.equals("Klasse")) subst.setClasses(classes); if (header.equals("Pos") || header.equals("Stunde") || header.equals("Std.")) { subst.setLesson(lesson); } if (header.equals("Art") || header.equals("Merkmal")) subst.setType("Vertretung"); continue; } Matcher previousCurrentMatcher = previousCurrentPattern.matcher(value); Matcher previousMatcher = previousPattern.matcher(value); switch (header) { case "Klasse": classes = new HashSet<>(Arrays.asList(value.split(", "))); subst.setClasses(classes); break; case "Pos": case "Stunde": case "Std.": lesson = value; subst.setLesson(lesson); break; case "VLehrer Kürzel": case "VLehrer": case "Vertreter": if (!value.startsWith("*")) { if (value.equals("Raumänderung")) { subst.setType(value); } else { subst.setTeacher(value); } } break; case "Lehrer": case "Lehrer Kürzel": case "Lehrer Name": if (previousCurrentMatcher.find()) { subst.setTeacher(previousCurrentMatcher.group(1)); subst.setPreviousTeacher(previousCurrentMatcher.group(2)); } else if (previousMatcher.find()) { subst.setPreviousTeacher(previousMatcher.group(1)); } else { subst.setPreviousTeacher(value); } break; case "VFach": subst.setSubject(value); break; case "Fach": case "Original Fach": if (previousCurrentMatcher.find()) { subst.setSubject(previousCurrentMatcher.group(1)); subst.setPreviousSubject(previousCurrentMatcher.group(2)); } else { subst.setPreviousSubject(value); } break; case "VRaum": subst.setRoom(value); break; case "Raum": case "Original Raum": if (previousCurrentMatcher.find()) { subst.setRoom(previousCurrentMatcher.group(1)); subst.setPreviousRoom(previousCurrentMatcher.group(2)); } else { subst.setPreviousRoom(value); } break; case "Art": case "Merkmal": subst.setType(value); break; case "Info": case "Mitteilung": subst.setDesc(value); if (!headers.contains("Art") && !headers.contains("Merkmal")) { } break; } } if (klasse != null) { Set fixedClasses = new HashSet<>(); fixedClasses.add(klasse); subst.setClasses(fixedClasses); } if (subst.getType() == null) { String recognizedType = null; if (subst.getDesc() != null) recognizedType = recognizeType(subst.getDesc()); subst.setType(recognizedType != null ? recognizedType : "Vertretung"); } subst.setColor(colorProvider.getColor(subst.getType())); day.addSubstitution(subst); } } @Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData); String url = scheduleData.getData().getString(PARAM_URL); Document doc = Jsoup.parse(httpGet(url, ENCODING)); List dayUrls = getDayUrls(url, doc); if (scheduleData.getData().has(PARAM_EMBEDDED_CONTENT_SELECTOR)) { for (Element el : doc.select(scheduleData.getData().getString(PARAM_EMBEDDED_CONTENT_SELECTOR))) { schedule.addDay(parseDay(el)); } } else { for (String dayUrl : dayUrls) { Document dayDoc; if (dayUrl.equals(url)) { dayDoc = doc; } else { dayDoc = Jsoup.parse(httpGet(dayUrl, ENCODING)); } schedule.addDay(parseDay(dayDoc)); } } schedule.setWebsite(url); schedule.setClasses(getAllClasses()); schedule.setTeachers(getAllTeachers()); return schedule; } @NotNull static List getDayUrls(String url, Document doc) throws IOException { List dayUrls = new ArrayList<>(); if (doc.select("ul.classes").size() > 0) { // List of classes Elements classes = doc.select("ul.classes li a"); for (Element klasse : classes) { dayUrls.add(new URL(new URL(url), klasse.attr("href")).toString()); } } else if (doc.select("ul.month").size() > 0) { // List of days in calendar view Elements days = doc.select("ul.month li input[onclick]"); for (Element day : days) { String urlFromOnclick = urlFromOnclick(day.attr("onclick")); if (urlFromOnclick == null) continue; dayUrls.add(new URL(new URL(url), urlFromOnclick).toString()); } } else if (doc.select("ul.day-index").size() > 0) { // List of days in list view Elements days = doc.select("ul.day-index li a"); for (Element day : days) { dayUrls.add(new URL(new URL(url), day.attr("href")).toString()); } } else { // Single day dayUrls.add(url); } return dayUrls; } private static String urlFromOnclick(String onclick) { Pattern pattern = Pattern.compile("window\\.location\\.href='([^']+)'"); Matcher matcher = pattern.matcher(onclick); if (matcher.find()) { return matcher.group(1); } else { return null; } } @NotNull SubstitutionScheduleDay parseDay(Element doc) throws IOException { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); String title = doc.select("h1.list-table-caption").first().text(); String klasse = null; // title can either be date or class if (title.matches("\\w+ \\d+\\.\\d+.\\d{4}")) { day.setDateString(title); day.setDate(ParserUtils.parseDate(title)); } else { klasse = title; String nextText = doc.select("h1.list-table-caption").first().nextElementSibling().text(); if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) { day.setDateString(nextText); day.setDate(ParserUtils.parseDate(nextText)); } else { throw new IOException("Could not find date"); } } for (Element p : doc.select(".row:has(h1.list-table-caption) p")) { for (TextNode node : p.textNodes()) { if (!node.text().trim().isEmpty()) day.addMessage(node.text().trim()); } } for (Element message : doc.select(".callout")) { for (TextNode node : message.textNodes()) { if (!node.text().trim().isEmpty()) day.addMessage(node.text().trim()); } } String lastChange = doc.select(".row.copyright div").first().ownText(); Pattern pattern = Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|"); Matcher matcher = pattern.matcher(lastChange); if (matcher.find()) { LocalDateTime lastChangeTime = DateTimeFormat.forPattern("dd-MM-yyyy HH:mm").parseLocalDateTime(matcher.group(1)); day.setLastChange(lastChangeTime); } if (doc.select(".list-table").size() > 0 || !doc.select(".callout").text().contains("Es liegen keine")) { Element table = doc.select(".list-table").first(); parseDaVinciTable(table, day, klasse, colorProvider); } return day; } @Override public List getAllClasses() throws IOException, JSONException { if (scheduleData.getData().has(PARAM_CLASSES_SOURCE)) { Document doc = Jsoup.parse(httpGet(scheduleData.getData().getString("classesSource"), ENCODING)); List classes = new ArrayList<>(); for (Element li : doc.select("li.Class")) { classes.add(li.text()); } return classes; } else { return getClassesFromJson(); } } @Override public List getAllTeachers() { return null; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy