Refactor: parse entire csv instead of by row
lvancraen committed May 2, 2019
1 parent 6883dc6 commit eff587c
Showing 2 changed files with 62 additions and 75 deletions.
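
In short, the previous implementation streamed the file through Papa Parse's `step` callback and indexed each record as it arrived, while the new one parses the whole table in a single call and then splits the result into header keys and data rows. The snippet below is a minimal sketch of the two approaches, not the library's code: the sample CSV and logged shapes are illustrative assumptions, and the exact shape of `row.data` inside `step` depends on the Papa Parse version.

const Papa = require('papaparse');

const csv = 'stop_id,stop_name\nS1,Central\nS2,Harbour';

// Before: stream the CSV row by row; each record is handled inside `step`.
Papa.parse(csv, {
  delimiter: ',',
  header: true,
  skipEmptyLines: true,
  step: (row) => {
    console.log(row.data);   // one parsed record
    console.log(row.errors); // parse errors for that record, if any
  },
});

// After: parse the entire file at once, then post-process the result.
const parsed = Papa.parse(csv, {
  delimiter: ',',
  skipEmptyLines: true,
});
// Without the `header` option, parsed.data is an array of arrays:
// the first row holds the column names, the remaining rows the values.
const [keys, ...rows] = parsed.data;
console.log(keys); // ['stop_id', 'stop_name']
console.log(rows); // [['S1', 'Central'], ['S2', 'Harbour']]
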
135 changes: 61 additions & 74 deletions helpers/import.js
@@ -39,99 +39,86 @@ function importTable(gtfs, tableName) {
*/

function processGtfsTable(gtfs, fileContent, tableName, indexKeys) {
let table = (indexKeys.setOfItems) ? new Set() : new Map();

Papa.parse(fileContent, {
const parsedFileContent = Papa.parse(fileContent, {
delimiter: ',',
header: true,
skipEmptyLines: true,
step: (row) => { // streams the CSV by row
const item = processGtfsTableRow(gtfs, tableName, row, indexKeys);
if (!item) {
return;
}

if (indexKeys.indexKey) {
table.set(item[indexKeys.indexKey], item);
} else if (indexKeys.firstIndexKey && indexKeys.secondIndexKey) {
if (table.has(item[indexKeys.firstIndexKey]) === false) {
table.set(item[indexKeys.firstIndexKey], new Map());
}

table.get(item[indexKeys.firstIndexKey]).set(item[indexKeys.secondIndexKey], item);
} else if (indexKeys.singleton) {
table = item;
} else if (indexKeys.setOfItems) {
table.add(item);
}
},
});

return table;
}
if (parsedFileContent.errors.length) {
let errorMessage = `Invalid rows in table ${tableName}:\n`;

function processGtfsTableRow(gtfs, tableName, row, indexKeys) {
let processedRow = JSON.parse(JSON.stringify(row));
const rowAsCsv = Papa.unparse(processedRow);

const errorsInRow = processedRow.errors;
if (errorsInRow.length) {
let errorMessage = `Invalid row in table ${tableName}:
Line: ${errorsInRow[0].row}
${rowAsCsv}\n\n`;
errorsInRow.forEach((error) => {
errorMessage += `Issue: ${error.message}`;
parsedFileContent.errors.forEach((error) => {
errorMessage += `Line: ${error.row}
Issue: ${error.message}
Row: ${parsedFileContent.data[error.row].join(',')}`;
});

const errorTypes = new Set(errorsInRow.map(error => error.type));
if (gtfs._shouldThrow === true && !errorTypes.has('FieldMismatch')) {
if (gtfs._shouldThrow === true) {
throw new Error(errorMessage);
}

errorMessage += '\nError in CSV was fixed by parser.';
process.notices.addWarning('Invalid CSV', errorMessage);
processedRow = Papa.parse(rowAsCsv, { // fix FieldMismatch errors (TooFewFields / TooManyFields)
delimiter: ',',
header: true,
});
}

const [keys, ...rows] = parsedFileContent.data;

checkThatKeysIncludeIndexKeys(keys, indexKeys, tableName);

const trimmedKeys = keys.map(key => key.trim());
const GtfsRow = createGtfsClassForKeys(trimmedKeys);

return processGtfsTableRows(gtfs, tableName, trimmedKeys, rows, indexKeys, GtfsRow);
}

function processGtfsTableRows(gtfs, tableName, keys, rows, indexKeys, GtfsRow) {
let table = (indexKeys.setOfItems) ? new Set() : new Map();

const regexPatternObjects = gtfs._regexPatternObjectsByTableName.get(tableName);
if (regexPatternObjects) {
processedRow = applyRegexPatternObjectsByTableName(regexPatternObjects, rowAsCsv, processedRow, tableName);
}

const rowObject = {};
for (const [field, value] of Object.entries(processedRow.data[0])) {
rowObject[field.trim()] = value.trim();
}
rows.forEach((row) => {
if (regexPatternObjects) {
row = applyRegexPatternObjectsByTableName(regexPatternObjects, keys, row, tableName);
}

const trimmedRow = row.map(value => value.trim());
const gtfsRow = new GtfsRow(trimmedRow);

if (indexKeys.indexKey) {
table.set(gtfsRow[indexKeys.indexKey], gtfsRow);
} else if (indexKeys.firstIndexKey && indexKeys.secondIndexKey) {
if (table.has(gtfsRow[indexKeys.firstIndexKey]) === false) {
table.set(gtfsRow[indexKeys.firstIndexKey], new Map());
}

checkThatKeysIncludeIndexKeys(Object.keys(rowObject), indexKeys, tableName);
table.get(gtfsRow[indexKeys.firstIndexKey]).set(gtfsRow[indexKeys.secondIndexKey], gtfsRow);
} else if (indexKeys.singleton) {
table = gtfsRow;
} else if (indexKeys.setOfItems) {
table.add(gtfsRow);
}
});

return createGtfsObjectFromSimpleObject(rowObject);
return table;
}

function applyRegexPatternObjectsByTableName(regexPatternObjects, rowAsCsv, row, tableName) {
let modifiedRowAsCsv;
let modifiedRow = JSON.parse(JSON.stringify(row));
function applyRegexPatternObjectsByTableName(regexPatternObjects, keys, row, tableName) {
const rowStringified = String(row);
let modifiedRowStringified = rowStringified;

regexPatternObjects.forEach(({ regex, pattern }) => {
modifiedRowAsCsv = rowAsCsv.replace(regex, pattern || '');
modifiedRowStringified = rowStringified.replace(regex, pattern || '');

if (modifiedRowAsCsv !== rowAsCsv) {
if (modifiedRowStringified !== rowStringified) {
process.notices.addInfo(
'Applying Changes on Raw GTFS',
`Applying regex replace to table: "${tableName}". regex: "${regex}".`
);
modifiedRow = Papa.parse(modifiedRowAsCsv, {
delimiter: ',',
header: true,
});
}
});

return modifiedRow;
const parsedModifiedRow = Papa.parse(`${keys}\n${modifiedRowStringified}`, {
delimiter: ',',
});

return parsedModifiedRow.data[1];
}

function checkThatKeysIncludeIndexKeys(sortedKeys, indexKeys, tableName) {
@@ -140,19 +140,19 @@ function checkThatKeysIncludeIndexKeys(sortedKeys, indexKeys, tableName) {
if (deepness === 1 && sortedKeys.includes(indexKeys.indexKey) === false && indexKeys.indexKey !== 'agency_id') {
/* Field agency_id is optional in table agency.txt according to the specification. */
throw new Error(
`Keys of table ${tableName} do not contain the index key: ${indexKeys.indexKey}.\n` +
` The values are: ${JSON.stringify(indexKeys.indexKey)}`
`Keys of table ${tableName} do not contain the index key: ${indexKeys.indexKey}.\n`
+ ` The values are: ${JSON.stringify(indexKeys.indexKey)}`
);
}

if (
deepness === 2 &&
(sortedKeys.includes(indexKeys.firstIndexKey) === false || sortedKeys.includes(indexKeys.secondIndexKey) === false)
deepness === 2
&& (sortedKeys.includes(indexKeys.firstIndexKey) === false || sortedKeys.includes(indexKeys.secondIndexKey) === false)
) {
throw new Error(
`Keys of table ${tableName} do not contain the index keys: ` +
`${indexKeys.firstIndexKey} and ${indexKeys.secondIndexKey}.\n` +
` The values are: ${JSON.stringify(indexKeys.indexKey)}`
`Keys of table ${tableName} do not contain the index keys: `
+ `${indexKeys.firstIndexKey} and ${indexKeys.secondIndexKey}.\n`
+ ` The values are: ${JSON.stringify(indexKeys.indexKey)}`
);
}
}
@@ -199,7 +186,7 @@ function createGtfsClassForKeys(sortedKeys) {
return jsonObj;
};

// eslint-disable-next-line func-names
// eslint-disable-next-line func-names
GtfsRow.prototype.toJSON = function () {
return JSON.stringify(this.toSimpleObject());
};
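
For context, the indexing strategy that processGtfsTableRows applies to the parsed rows can be summarized with the self-contained sketch below. This is not the library's code, and the sample rows and indexKeys configurations are assumptions made for illustration.

// Minimal sketch of the indexing logic shown in processGtfsTableRows above.
function indexRows(rows, indexKeys) {
  let table = indexKeys.setOfItems ? new Set() : new Map();

  rows.forEach((row) => {
    if (indexKeys.indexKey) {
      // single key, e.g. stops keyed by stop_id
      table.set(row[indexKeys.indexKey], row);
    } else if (indexKeys.firstIndexKey && indexKeys.secondIndexKey) {
      // two-level key, e.g. stop_times keyed by trip_id, then stop_sequence
      if (!table.has(row[indexKeys.firstIndexKey])) {
        table.set(row[indexKeys.firstIndexKey], new Map());
      }
      table.get(row[indexKeys.firstIndexKey]).set(row[indexKeys.secondIndexKey], row);
    } else if (indexKeys.singleton) {
      // a single row replaces the table entirely
      table = row;
    } else if (indexKeys.setOfItems) {
      // no usable key: keep an unordered set of rows
      table.add(row);
    }
  });

  return table;
}

const stops = indexRows(
  [{ stop_id: 'S1', stop_name: 'Central' }, { stop_id: 'S2', stop_name: 'Harbour' }],
  { indexKey: 'stop_id' }
);
console.log(stops.get('S2').stop_name); // 'Harbour'
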
2 changes: 1 addition & 1 deletion tests/tests.js
@@ -98,7 +98,7 @@ describe('Tests on GTFS', () => {

// Fixes field using regexPatternObjectsByTableName
const regexPatternObjectsByTableName = new Map([[
'stops', [{ regex: /,"Some ""other"" stop",/g, pattern: ',"Some stop",' }],
'stops', [{ regex: /,Some "other" stop,/g, pattern: ',Some stop,' }],
]]);

const gtfsWithFix = new Gtfs(path, { regexPatternObjectsByTableName });
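
The updated test regex follows from the refactor above: previously each row was re-serialized with Papa.unparse before the regex replacements ran, so quoted fields appeared in CSV-escaped form, whereas the new code stringifies the parsed row array with String(row), which joins the values with commas and no quoting. A rough sketch of the difference, using made-up sample values:

const Papa = require('papaparse');

const row = ['S1', 'Some "other" stop', 'Some description'];

// Old behaviour: regexes matched a CSV-escaped serialization, where a field
// containing quotes is wrapped in quotes and its inner quotes are doubled.
console.log(Papa.unparse([row])); // S1,"Some ""other"" stop",Some description

// New behaviour: regexes match the plain comma-joined values.
console.log(String(row));         // S1,Some "other" stop,Some description
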
