
org.sonar.plugins.python.IpynbNotebookParser Maven / Gradle / Ivy
/*
* SonarQube Python Plugin
* Copyright (C) 2011-2024 SonarSource SA
* mailto:info AT sonarsource DOT com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package org.sonar.plugins.python;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonLocation;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.sonar.python.IPythonLocation;
public class IpynbNotebookParser {
public static final String SONAR_PYTHON_NOTEBOOK_CELL_DELIMITER = "#SONAR_PYTHON_NOTEBOOK_CELL_DELIMITER";
private static final Set ACCEPTED_LANGUAGE = Set.of("python", "ipython");
public static Optional parseNotebook(PythonInputFile inputFile) {
try {
return new IpynbNotebookParser(inputFile).parse();
} catch (IOException e) {
throw new IllegalStateException("Cannot read " + inputFile, e);
}
}
private IpynbNotebookParser(PythonInputFile inputFile) {
this.inputFile = inputFile;
}
private final PythonInputFile inputFile;
private int lastPythonLine = 0;
public Optional parse() throws IOException {
// If the language is not present, we assume it is a Python notebook
var isPythonNotebook = parseLanguage().map(ACCEPTED_LANGUAGE::contains).orElse(true);
return Boolean.TRUE.equals(isPythonNotebook) ? Optional.of(parseNotebook()) : Optional.empty();
}
public Optional parseLanguage() throws IOException {
String content = inputFile.wrappedFile().contents();
JsonFactory factory = new JsonFactory();
try (JsonParser jParser = factory.createParser(content)) {
while (!jParser.isClosed()) {
JsonToken jsonToken = jParser.nextToken();
if (JsonToken.FIELD_NAME.equals(jsonToken) && "language".equals(jParser.currentName())) {
jParser.nextToken();
return Optional.ofNullable(jParser.getValueAsString());
}
}
}
return Optional.empty();
}
public GeneratedIPythonFile parseNotebook() throws IOException {
String content = inputFile.wrappedFile().contents();
boolean isCompressed = content.lines().count() <= 1;
JsonFactory factory = new JsonFactory();
try (JsonParser jParser = factory.createParser(content)) {
return parseCells(jParser, isCompressed).map(notebookData -> {
// Account for EOF token
JsonLocation location = jParser.currentTokenLocation();
notebookData.addDefaultLocation(lastPythonLine, location.getLineNr(), location.getColumnNr());
return new GeneratedIPythonFile(inputFile.wrappedFile(), notebookData.getAggregatedSource().toString(), notebookData.getLocationMap());
}).orElse(new GeneratedIPythonFile(inputFile.wrappedFile(), "", new LinkedHashMap<>()));
}
}
private Optional parseCells(JsonParser parser, boolean isCompressed) throws IOException {
while (!parser.isClosed()) {
parser.nextToken();
String fieldName = parser.currentName();
if ("cells".equals(fieldName)) {
// consume array start token
parser.nextToken();
Optional data = parseCellArray(parser, isCompressed);
parser.close();
return data;
}
}
return Optional.empty();
}
private Optional parseCellArray(JsonParser jParser, boolean isCompressed) throws IOException {
List cellsData = new ArrayList<>();
while (jParser.nextToken() != JsonToken.END_ARRAY) {
if (jParser.currentToken() == JsonToken.START_OBJECT) {
processCodeCell(cellsData, jParser, isCompressed);
}
}
Optional aggregatedNotebookData = cellsData.stream().reduce(NotebookParsingData::combine);
aggregatedNotebookData.ifPresent(NotebookParsingData::removeTrailingExtraLine);
return aggregatedNotebookData;
}
private static void skipNestedObjects(JsonParser parser) throws IOException {
if (parser.currentToken() == JsonToken.START_OBJECT || parser.currentToken() == JsonToken.START_ARRAY) {
parser.skipChildren();
}
}
private static boolean processCodeCellType(JsonParser jParser) throws IOException {
if (JsonToken.FIELD_NAME.equals(jParser.currentToken()) && "cell_type".equals(jParser.currentName())) {
jParser.nextToken();
if ("code".equals(jParser.getValueAsString())) {
return true;
}
}
return false;
}
private void processCodeCell(List accumulator, JsonParser jParser, boolean isCompressed) throws IOException {
boolean isCodeCell = false;
Optional notebookData = Optional.empty();
while (jParser.nextToken() != JsonToken.END_OBJECT) {
skipNestedObjects(jParser);
if (!isCodeCell) {
isCodeCell = processCodeCellType(jParser);
}
if (JsonToken.FIELD_NAME.equals(jParser.currentToken()) && "source".equals(jParser.currentName())) {
jParser.nextToken();
int startLine = 0;
if (!accumulator.isEmpty()) {
startLine = accumulator.get(accumulator.size() - 1).getAggregatedSourceLine();
}
switch (jParser.currentToken()) {
case START_ARRAY:
notebookData = Optional.of(parseSourceArray(startLine, jParser, isCompressed));
break;
case VALUE_STRING:
notebookData = Optional.of(parseSourceMultilineString(startLine, jParser));
break;
default:
throw new IllegalStateException("Unexpected token: " + jParser.currentToken());
}
}
}
if (isCodeCell && notebookData.isPresent()) {
var data = notebookData.get();
lastPythonLine = data.getAggregatedSourceLine();
accumulator.add(data);
}
}
private static NotebookParsingData parseSourceArray(int startLine, JsonParser jParser, boolean isCompressed) throws IOException {
NotebookParsingData cellData = NotebookParsingData.fromLine(startLine);
JsonLocation tokenLocation = jParser.currentTokenLocation();
// In case of an empty cell, we don't add an extra line
var lastSourceLine = "\n";
while (jParser.nextToken() != JsonToken.END_ARRAY) {
String sourceLine = jParser.getValueAsString();
var newTokenLocation = jParser.currentTokenLocation();
var countEscapedChar = countEscapeCharacters(sourceLine, newTokenLocation.getColumnNr());
cellData.addLineToSource(sourceLine, newTokenLocation.getLineNr(), newTokenLocation.getColumnNr(), countEscapedChar, isCompressed);
lastSourceLine = sourceLine;
tokenLocation = newTokenLocation;
}
if (!lastSourceLine.endsWith("\n")) {
cellData.appendToSource("\n");
}
// Account for the last cell delimiter
cellData.addDelimiterToSource(SONAR_PYTHON_NOTEBOOK_CELL_DELIMITER + "\n", tokenLocation.getLineNr(), tokenLocation.getColumnNr());
return cellData;
}
private static NotebookParsingData parseSourceMultilineString(int startLine, JsonParser jParser) throws IOException {
NotebookParsingData cellData = NotebookParsingData.fromLine(startLine);
String sourceLine = jParser.getValueAsString();
JsonLocation tokenLocation = jParser.currentTokenLocation();
var previousLen = 0;
var previousExtraChars = 0;
var isFirstLine = true;
for (String line : sourceLine.lines().toList()) {
var countEscapedChar = countEscapeCharacters(line, previousLen + previousExtraChars + tokenLocation.getColumnNr());
var currentCount = countEscapedChar.get(-1);
cellData.addLineToSource(line, new IPythonLocation(tokenLocation.getLineNr(),
tokenLocation.getColumnNr() + previousLen + previousExtraChars, countEscapedChar, true));
cellData.appendToSource("\n");
previousLen = previousLen + line.length() + 2;
previousExtraChars = previousExtraChars + currentCount;
if (isFirstLine) {
isFirstLine = false;
previousLen += 1;
}
}
// Account for the last cell delimiter
cellData.addDelimiterToSource(SONAR_PYTHON_NOTEBOOK_CELL_DELIMITER + "\n", tokenLocation.getLineNr(), tokenLocation.getColumnNr());
return cellData;
}
private static Map countEscapeCharacters(String sourceLine, int colOffSet) {
Map colMap = new LinkedHashMap<>();
int count = 0;
var numberOfExtraChars = 0;
var arr = sourceLine.toCharArray();
for (int i = 0; i < sourceLine.length(); ++i) {
char c = arr[i];
switch (c) {
case '"', '\\':
numberOfExtraChars++;
colMap.put(i, i + colOffSet + count + numberOfExtraChars);
break;
// we never encounter \n or \r as the lines are split at these characters
case '\b', '\f', '\t':
// we increase the count of one char as we count the \ but not the t or b
count += 1;
break;
default:
break;
}
}
colMap.put(-1, numberOfExtraChars);
return colMap;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy