Better handling of special characters

This commit is contained in:
Knut Sveidqvist
2025-06-13 08:09:54 +02:00
parent a0bd8e2f64
commit 4a5e1a3250
7 changed files with 322 additions and 64 deletions

View File

@@ -83,7 +83,7 @@
"@vitest/spy": "^3.0.6",
"@vitest/ui": "^3.0.6",
"ajv": "^8.17.1",
"chokidar": "^4.0.3",
"chokidar": "3.6.0",
"concurrently": "^9.1.2",
"cors": "^2.8.5",
"cpy-cli": "^5.0.0",

View File

@@ -106,7 +106,7 @@
"@types/stylis": "^4.2.7",
"@types/uuid": "^10.0.0",
"ajv": "^8.17.1",
"chokidar": "^4.0.3",
"chokidar": "3.6.0",
"concurrently": "^9.1.2",
"csstree-validator": "^4.0.1",
"globby": "^14.0.2",

View File

@@ -375,7 +375,7 @@ export class FlowchartAstVisitor extends BaseVisitor {
if (nodeId.startsWith(keyword)) {
// Allow if the keyword is not followed by a delimiter (e.g., "endpoint" is OK, "end.node" is not)
const afterKeyword = nodeId.substring(keyword.length);
if (afterKeyword.length === 0 || /^[./\-]/.test(afterKeyword)) {
if (afterKeyword.length === 0 || /^[./-]/.test(afterKeyword)) {
throw new Error(`Node ID cannot start with reserved keyword: ${keyword}`);
}
}
@@ -835,6 +835,10 @@ export class FlowchartAstVisitor extends BaseVisitor {
const endNodeId = this.visit(ctx.nodeId[1]);
const linkData = this.visit(ctx.link);
// Ensure both start and end nodes exist as vertices
this.ensureVertex(startNodeId);
this.ensureVertex(endNodeId);
const edge: any = {
start: startNodeId,
end: endNodeId,
@@ -850,6 +854,17 @@ export class FlowchartAstVisitor extends BaseVisitor {
this.edges.push(edge);
}
// Helper method to ensure a vertex exists
private ensureVertex(nodeId: string): void {
if (!this.vertices[nodeId]) {
this.vertices[nodeId] = {
id: nodeId,
text: nodeId,
type: 'default',
};
}
}
// Missing visitor methods
linkStyleStatement(_ctx: any): void {
// Handle link style statements

View File

@@ -55,12 +55,12 @@ const EOF = createToken({
// Modified to include special characters and handle minus character edge cases
// Allows - in node IDs including standalone -, -at-start, and -at-end patterns
// Avoids conflicts with link tokens by using negative lookahead for link patterns
// Handles compound cases like &node, -node, vnode where special chars are followed by word chars
// Only matches compound patterns (special char + word chars), not standalone special chars
// Handles compound cases like &node, -node, vnode where special chars are followed by word chars // cspell:disable-line
// Complex pattern to handle all edge cases including punctuation at start/end
const NODE_STRING = createToken({
name: 'NODE_STRING',
pattern:
/\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
/\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|:[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|,[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`-]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
});
// ============================================================================
@@ -146,6 +146,7 @@ const Default = createToken({
const DirectionValue = createToken({
name: 'DirectionValue',
pattern: /LR|RL|TB|BT|TD|BR|<|>|\^|v/,
longer_alt: NODE_STRING,
});
// ============================================================================
@@ -202,37 +203,46 @@ const ShapeDataStart = createToken({
// LINK TOKENS (JISON lines 154-164)
// ============================================================================
// Regular links without text
const LINK = createToken({
name: 'LINK',
pattern: /\s*[<ox]?--+[>ox-]\s*/,
pattern: /[<ox]?--+[>ox-]/,
longer_alt: NODE_STRING,
});
const START_LINK = createToken({
name: 'START_LINK',
pattern: /\s*[<ox]?--\s*/,
pattern: /[<ox]?--/,
push_mode: 'edgeText_mode',
longer_alt: NODE_STRING,
});
// Regular thick links without text
const THICK_LINK = createToken({
name: 'THICK_LINK',
pattern: /\s*[<ox]?==+[=>ox-]?\s*/,
pattern: /[<ox]?==+[=>ox-]?/,
longer_alt: NODE_STRING,
});
const START_THICK_LINK = createToken({
name: 'START_THICK_LINK',
pattern: /\s*[<ox]?==(?=\s*\|)\s*/,
pattern: /[<ox]?==/,
push_mode: 'thickEdgeText_mode',
longer_alt: NODE_STRING,
});
// Regular dotted links without text
const DOTTED_LINK = createToken({
name: 'DOTTED_LINK',
pattern: /\s*[<ox]?-?\.+-[>ox-]?\s*/,
pattern: /[<ox]?-?\.+-[>ox-]?/,
longer_alt: NODE_STRING,
});
const START_DOTTED_LINK = createToken({
name: 'START_DOTTED_LINK',
pattern: /\s*[<ox]?-\.(?!-)\s*/,
pattern: /[<ox]?-\./,
push_mode: 'dottedEdgeText_mode',
longer_alt: NODE_STRING,
});
// ============================================================================
@@ -297,6 +307,7 @@ const Comma = createToken({
const Pipe = createToken({
name: 'Pipe',
pattern: /\|/,
push_mode: 'text_mode',
});
const Ampersand = createToken({
@@ -311,38 +322,38 @@ const Minus = createToken({
longer_alt: NODE_STRING,
});
// Additional special character tokens for node IDs
const Hash = createToken({
name: 'Hash',
pattern: /#/,
longer_alt: NODE_STRING,
});
// Additional special character tokens for node IDs - currently unused but kept for future reference
// const Hash = createToken({
// name: 'Hash',
// pattern: /#/,
// longer_alt: NODE_STRING,
// });
const Asterisk = createToken({
name: 'Asterisk',
pattern: /\*/,
longer_alt: NODE_STRING,
});
// const Asterisk = createToken({
// name: 'Asterisk',
// pattern: /\*/,
// longer_alt: NODE_STRING,
// });
const Dot = createToken({
name: 'Dot',
pattern: /\./,
longer_alt: NODE_STRING,
});
// const Dot = createToken({
// name: 'Dot',
// pattern: /\./,
// longer_alt: NODE_STRING,
// });
// Backslash token removed - handled entirely by NODE_STRING
const Slash = createToken({
name: 'Slash',
pattern: /\//,
longer_alt: NODE_STRING,
});
// const Slash = createToken({
// name: 'Slash',
// pattern: /\//,
// longer_alt: NODE_STRING,
// });
const Underscore = createToken({
name: 'Underscore',
pattern: /_/,
longer_alt: NODE_STRING,
});
// const Underscore = createToken({
// name: 'Underscore',
// pattern: /_/,
// longer_alt: NODE_STRING,
// });
const NumberToken = createToken({
name: 'NumberToken',
@@ -452,6 +463,13 @@ const DiamondEnd = createToken({
pop_mode: true,
});
// Pipe token for text mode that pops back to initial mode
const PipeEnd = createToken({
name: 'PipeEnd',
pattern: /\|/,
pop_mode: true,
});
// Tokens for edge text modes (JISON lines 156, 160, 164)
const EdgeTextContent = createToken({
name: 'EdgeTextContent',
@@ -557,12 +575,9 @@ const multiModeLexerDefinition = {
HexagonStart,
DiamondStart,
// Basic punctuation (must come before NODE_STRING)
// Basic punctuation (must come before NODE_STRING for proper tokenization)
Pipe,
Colon,
Comma,
Ampersand,
Minus,
// Node strings and numbers (must come after punctuation)
NODE_STRING,
@@ -601,7 +616,7 @@ const multiModeLexerDefinition = {
HexagonEnd,
DiamondEnd,
QuotedString,
Pipe, // Special handling for pipe in text mode
PipeEnd, // Pipe that pops back to initial mode
TextContent,
],
@@ -753,6 +768,7 @@ export const allTokens = [
Colon,
Comma,
Pipe,
PipeEnd,
Ampersand,
Minus,
];
@@ -850,6 +866,7 @@ export {
Colon,
Comma,
Pipe,
PipeEnd,
Ampersand,
Minus,
};

View File

@@ -55,6 +55,34 @@ export class FlowchartParser extends CstParser {
private statement = this.RULE('statement', () => {
this.OR([
{ ALT: () => this.SUBRULE(this.vertexStatement) },
// Standalone link statement only when pattern is exactly nodeId link nodeId (no continuation)
{
ALT: () => this.SUBRULE(this.standaloneLinkStatement),
GATE: () => {
// Look ahead to see if this is a simple nodeId link nodeId pattern
// without any continuation (like more links or ampersands)
const la1 = this.LA(1); // First token (should be nodeId)
const la2 = this.LA(2); // Second token (should be link)
const la3 = this.LA(3); // Third token (should be nodeId)
const la4 = this.LA(4); // Fourth token (should be separator or EOF)
// Check if we have the exact pattern: nodeId link nodeId separator/EOF
return (
(la1.tokenType === tokens.NODE_STRING || la1.tokenType === tokens.NumberToken) &&
(la2.tokenType === tokens.LINK ||
la2.tokenType === tokens.THICK_LINK ||
la2.tokenType === tokens.DOTTED_LINK ||
la2.tokenType === tokens.START_LINK ||
la2.tokenType === tokens.START_THICK_LINK ||
la2.tokenType === tokens.START_DOTTED_LINK) &&
(la3.tokenType === tokens.NODE_STRING || la3.tokenType === tokens.NumberToken) &&
(la4 === undefined ||
la4.tokenType === tokens.Semicolon ||
la4.tokenType === tokens.Newline ||
la4.tokenType === tokens.WhiteSpace)
);
},
},
{ ALT: () => this.SUBRULE(this.styleStatement) },
{ ALT: () => this.SUBRULE(this.linkStyleStatement) },
{ ALT: () => this.SUBRULE(this.classDefStatement) },
@@ -170,7 +198,7 @@ export class FlowchartParser extends CstParser {
// TODO: Add style separator support when implementing styling
});
// Node ID - handles both simple and compound node IDs
// Node ID - handles both simple and special character node IDs
private nodeId = this.RULE('nodeId', () => {
this.OR([
{ ALT: () => this.CONSUME(tokens.NODE_STRING) },
@@ -287,11 +315,11 @@ export class FlowchartParser extends CstParser {
});
});
// Arrow text - PIPE text PIPE
// Arrow text - PIPE text PipeEnd
private arrowText = this.RULE('arrowText', () => {
this.CONSUME(tokens.Pipe);
this.SUBRULE(this.text);
this.CONSUME2(tokens.Pipe);
this.CONSUME(tokens.PipeEnd);
});
// Text rule - following JISON pattern

212
plan.md Normal file
View File

@@ -0,0 +1,212 @@
# Chevrotain Parser Implementation Plan
## Current Status: 86% Complete ✅
**Progress**: 174/203 tests passing (86% success rate)
**Major Achievements**:
- ✅ Fixed grammar ambiguity issues
- ✅ Added `standaloneLinkStatement` to statement rule with proper lookahead
- ✅ Core parser architecture is working
- ✅ Most single node, vertex, and basic edge tests are passing
## Remaining Issues: 29 Tests (3 Core Problems)
### ✅ COMPLETED: Phase 3 - Special Characters (4 tests)
**Status**: FIXED - All special character tests now passing
**Solution**: Removed conflicting punctuation tokens from lexer main mode
**Impact**: +2 tests (174/203 passing)
### 1. Node Creation in Edges (17 tests) - HIGH PRIORITY
**Problem**: `Cannot read properties of undefined (reading 'id')`
**Root Cause**: When parsing edges like `A-->B`, vertices A and B are not being created in the vertices map
**Examples of Failing Tests**:
- `should handle basic arrow` (`A-->B`)
- `should handle multiple edges` (`A-->B; B-->C`)
- `should handle chained edges` (`A-->B-->C`)
**Solution Strategy**:
1. **Investigate which grammar rule is actually being used** for failing tests
2. **Add vertex creation to all edge processing paths**:
- `standaloneLinkStatement` visitor (already has `ensureVertex()`)
- `vertexStatement` with link chains
- Any other edge processing methods
3. **Test the fix incrementally** with one failing test at a time
**Implementation Steps**:
```typescript
// In flowAst.ts - ensure all edge processing creates vertices
private ensureVertex(nodeId: string): void {
if (!this.vertices[nodeId]) {
this.vertices[nodeId] = {
id: nodeId,
text: nodeId,
type: 'default',
};
}
}
// Add to ALL methods that process edges:
// - standaloneLinkStatement ✅ (already done)
// - vertexStatement (when it has link chains)
// - linkChain processing
// - Any other edge creation paths
```
### 2. Arrow Text Parsing (10 tests) - MEDIUM PRIORITY
**Problem**: `Parse error: Expecting token of type --> EOF <-- but found --> '|' <--`
**Root Cause**: Lexer not properly handling pipe character `|` in arrow text patterns like `A-->|text|B`
**Examples of Failing Tests**:
- `should handle arrow with text` (`A-->|text|B`)
- `should handle edges with quoted text` (`A-->|"quoted text"|B`)
**Solution Strategy**:
1. **Fix lexer mode switching** for pipe characters
2. **Follow original JISON grammar** for arrow text patterns
3. **Implement proper tokenization** of `LINK + PIPE + text + PIPE` sequences
**Implementation Steps**:
```typescript
// In flowLexer.ts - fix pipe character handling
// Current issue: PIPE token conflicts with text content
// Solution: Use lexer modes or proper token precedence
// 1. Check how JISON handles |text| patterns
// 2. Implement similar tokenization in Chevrotain
// 3. Ensure link text is properly captured and processed
```
### 3. Special Characters at Node Start (4 tests) - LOW PRIORITY
**Problem**: Specific characters (`:`, `&`, `,`, `-`) at start of node IDs not being parsed
**Root Cause**: TOKEN precedence issues where punctuation tokens override NODE_STRING
**Examples of Failing Tests**:
- Node IDs starting with `:`, `&`, `,`, `-`
**Solution Strategy**:
1. **Adjust token precedence** in lexer
2. **Modify NODE_STRING pattern** to handle special characters
3. **Test with each special character individually**
## Execution Plan
### Phase 1: Fix Node Creation (Target: +17 tests = 189/203 passing)
**Timeline**: 1-2 hours
**Priority**: HIGH - This affects the most tests
1. **Debug which grammar rule is being used** for failing edge tests
```bash
# Add logging to AST visitor methods to see which path is taken
vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
```
2. **Add vertex creation to all edge processing paths**
- Check `vertexStatement` when it processes link chains
- Check `linkChain` processing
- Ensure `ensureVertex()` is called for all edge endpoints
3. **Test incrementally**
```bash
# Test one failing test at a time
vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
```
### Phase 2: Fix Arrow Text Parsing (Target: +10 tests = 199/203 passing)
**Timeline**: 2-3 hours
**Priority**: MEDIUM - Complex lexer issue
1. **Analyze original JISON grammar** for arrow text patterns
```bash
# Check how flow.jison handles |text| patterns
grep -n "EdgeText\|PIPE" packages/mermaid/src/diagrams/flowchart/parser/flow.jison
```
2. **Fix lexer tokenization** for pipe characters
- Implement proper mode switching or token precedence
- Ensure `A-->|text|B` tokenizes as `NODE_STRING LINK PIPE TEXT PIPE NODE_STRING`
3. **Update grammar rules** to handle arrow text
- Ensure link rules can consume pipe-delimited text
- Test with various text patterns (quoted, unquoted, complex)
### Phase 3: Fix Special Characters (Target: +4 tests = 203/203 passing)
**Timeline**: 1 hour
**Priority**: LOW - Affects fewest tests
1. **Identify token conflicts** for each special character
2. **Adjust lexer token order** or patterns
3. **Test each character individually**
## Success Criteria
### Phase 1 Success:
- [ ] All basic edge tests pass (`A-->B`, `A-->B-->C`, etc.)
- [ ] Vertices are created for all edge endpoints
- [ ] No regression in currently passing tests
### Phase 2 Success:
- [ ] All arrow text tests pass (`A-->|text|B`)
- [ ] Lexer properly tokenizes pipe-delimited text
- [ ] Grammar correctly parses arrow text patterns
### Phase 3 Success:
- [ ] All special character tests pass
- [ ] Node IDs can start with `:`, `&`, `,`, `-`
- [ ] No conflicts with other tokens
### Final Success:
- [ ] **203/203 tests passing (100%)**
- [ ] Full compatibility with original JISON parser
- [ ] All existing functionality preserved
## Risk Mitigation
### High Risk: Breaking Currently Passing Tests
**Mitigation**: Run full test suite after each change
```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
```
### Medium Risk: Lexer Changes Affecting Other Patterns
**Mitigation**: Test with diverse input patterns, not just failing tests
### Low Risk: Performance Impact
**Mitigation**: Current implementation is already efficient, changes should be minimal
## Tools and Commands
### Run Specific Test:
```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
```
### Run All Chevrotain Tests:
```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
```
### Debug Lexer Tokenization:
```typescript
// In flowParserAdapter.ts
const lexResult = FlowChevLexer.tokenize(input);
console.debug('Tokens:', lexResult.tokens.map(t => [t.image, t.tokenType.name]));
console.debug('Errors:', lexResult.errors);
```
### Check Grammar Rule Usage:
```typescript
// Add logging to AST visitor methods
console.debug('Using standaloneLinkStatement for:', ctx);
```
## Next Actions
1. **Start with Phase 1** - Fix node creation (highest impact)
2. **Debug the exact grammar path** being taken for failing tests
3. **Add vertex creation to all edge processing methods**
4. **Test incrementally** to avoid regressions
5. **Move to Phase 2** only after Phase 1 is complete
This systematic approach ensures we fix the most impactful issues first while maintaining the stability of the 85% of tests that are already passing.

22
pnpm-lock.yaml generated
View File

@@ -74,8 +74,8 @@ importers:
specifier: ^8.17.1
version: 8.17.1
chokidar:
specifier: ^4.0.3
version: 4.0.3
specifier: 3.6.0
version: 3.6.0
concurrently:
specifier: ^9.1.2
version: 9.1.2
@@ -330,8 +330,8 @@ importers:
specifier: ^8.17.1
version: 8.17.1
chokidar:
specifier: ^4.0.3
version: 4.0.3
specifier: 3.6.0
version: 3.6.0
concurrently:
specifier: ^9.1.2
version: 9.1.2
@@ -4558,10 +4558,6 @@ packages:
resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
engines: {node: '>= 8.10.0'}
chokidar@4.0.3:
resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==}
engines: {node: '>= 14.16.0'}
chrome-trace-event@1.0.4:
resolution: {integrity: sha512-rNjApaLzuwaOTjCiT8lSDdGN1APCiqkChLMJxJPWLunPAt5fy8xgU9/jNOchV84wfIxrA0lRQB7oCT8jrn/wrQ==}
engines: {node: '>=6.0'}
@@ -8430,10 +8426,6 @@ packages:
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
engines: {node: '>=8.10.0'}
readdirp@4.1.2:
resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==}
engines: {node: '>= 14.18.0'}
real-require@0.2.0:
resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==}
engines: {node: '>= 12.13.0'}
@@ -15468,10 +15460,6 @@ snapshots:
optionalDependencies:
fsevents: 2.3.3
chokidar@4.0.3:
dependencies:
readdirp: 4.1.2
chrome-trace-event@1.0.4: {}
ci-info@3.9.0: {}
@@ -20193,8 +20181,6 @@ snapshots:
dependencies:
picomatch: 2.3.1
readdirp@4.1.2: {}
real-require@0.2.0: {}
rechoir@0.6.2: