From 4a5e1a3250edfc9c55f039bfbdc32bcb034ede91 Mon Sep 17 00:00:00 2001 From: Knut Sveidqvist Date: Fri, 13 Jun 2025 08:09:54 +0200 Subject: [PATCH] Better handling of special characters --- package.json | 2 +- packages/mermaid/package.json | 2 +- .../src/diagrams/flowchart/parser/flowAst.ts | 17 +- .../diagrams/flowchart/parser/flowLexer.ts | 97 ++++---- .../diagrams/flowchart/parser/flowParser.ts | 34 ++- plan.md | 212 ++++++++++++++++++ pnpm-lock.yaml | 22 +- 7 files changed, 322 insertions(+), 64 deletions(-) create mode 100644 plan.md diff --git a/package.json b/package.json index fa189f008..75d577b86 100644 --- a/package.json +++ b/package.json @@ -83,7 +83,7 @@ "@vitest/spy": "^3.0.6", "@vitest/ui": "^3.0.6", "ajv": "^8.17.1", - "chokidar": "^4.0.3", + "chokidar": "3.6.0", "concurrently": "^9.1.2", "cors": "^2.8.5", "cpy-cli": "^5.0.0", diff --git a/packages/mermaid/package.json b/packages/mermaid/package.json index 3ee766d02..23145c6de 100644 --- a/packages/mermaid/package.json +++ b/packages/mermaid/package.json @@ -106,7 +106,7 @@ "@types/stylis": "^4.2.7", "@types/uuid": "^10.0.0", "ajv": "^8.17.1", - "chokidar": "^4.0.3", + "chokidar": "3.6.0", "concurrently": "^9.1.2", "csstree-validator": "^4.0.1", "globby": "^14.0.2", diff --git a/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts b/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts index 2750feb40..f03fbe544 100644 --- a/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts +++ b/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts @@ -375,7 +375,7 @@ export class FlowchartAstVisitor extends BaseVisitor { if (nodeId.startsWith(keyword)) { // Allow if the keyword is not followed by a delimiter (e.g., "endpoint" is OK, "end.node" is not) const afterKeyword = nodeId.substring(keyword.length); - if (afterKeyword.length === 0 || /^[./\-]/.test(afterKeyword)) { + if (afterKeyword.length === 0 || /^[./-]/.test(afterKeyword)) { throw new Error(`Node ID cannot start with reserved keyword: ${keyword}`); } } @@ -835,6 +835,10 @@ export class FlowchartAstVisitor extends BaseVisitor { const endNodeId = this.visit(ctx.nodeId[1]); const linkData = this.visit(ctx.link); + // Ensure both start and end nodes exist as vertices + this.ensureVertex(startNodeId); + this.ensureVertex(endNodeId); + const edge: any = { start: startNodeId, end: endNodeId, @@ -850,6 +854,17 @@ export class FlowchartAstVisitor extends BaseVisitor { this.edges.push(edge); } + // Helper method to ensure a vertex exists + private ensureVertex(nodeId: string): void { + if (!this.vertices[nodeId]) { + this.vertices[nodeId] = { + id: nodeId, + text: nodeId, + type: 'default', + }; + } + } + // Missing visitor methods linkStyleStatement(_ctx: any): void { // Handle link style statements diff --git a/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts b/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts index 9d5ecd93b..491d34f16 100644 --- a/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts +++ b/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts @@ -55,12 +55,12 @@ const EOF = createToken({ // Modified to include special characters and handle minus character edge cases // Allows - in node IDs including standalone -, -at-start, and -at-end patterns // Avoids conflicts with link tokens by using negative lookahead for link patterns -// Handles compound cases like &node, -node, vnode where special chars are followed by word chars -// Only matches compound patterns (special char + word chars), not standalone special chars 
+// Handles compound cases like &node, -node, vnode where special chars are followed by word chars // cspell:disable-line
+// Complex pattern to handle all edge cases including punctuation at start/end
 const NODE_STRING = createToken({
   name: 'NODE_STRING',
   pattern:
-    /\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
+    /\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|:[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|,[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`-]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
 });

 // ============================================================================
@@ -146,6 +146,7 @@ const Default = createToken({
 const DirectionValue = createToken({
   name: 'DirectionValue',
   pattern: /LR|RL|TB|BT|TD|BR|<|>|\^|v/,
+  longer_alt: NODE_STRING,
 });

 // ============================================================================
@@ -202,37 +203,46 @@ const ShapeDataStart = createToken({
 // LINK TOKENS (JISON lines 154-164)
 // ============================================================================

+// Regular links without text
 const LINK = createToken({
   name: 'LINK',
-  pattern: /\s*[ox-]\s*/,
+  pattern: /[ox-]/,
+  longer_alt: NODE_STRING,
 });

 const START_LINK = createToken({
   name: 'START_LINK',
-  pattern: /\s*[ox-]?\s*/,
+  pattern: /[ox-]?/,
+  longer_alt: NODE_STRING,
 });

 const START_THICK_LINK = createToken({
   name: 'START_THICK_LINK',
-  pattern: /\s*[ox-]?\s*/,
+  pattern: /[ox-]?/,
+  longer_alt: NODE_STRING,
 });

 const START_DOTTED_LINK = createToken({
   name: 'START_DOTTED_LINK',
-  pattern: /\s*[ox-]?\s*/,
+  pattern: /[ox-]?/,
+  longer_alt: NODE_STRING,
 });

diff --git a/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts b/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts
--- a/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts
+++ b/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts
   private statement = this.RULE('statement', () => {
     this.OR([
       { ALT: () => this.SUBRULE(this.vertexStatement) },
+      // Standalone link statement only when the pattern is exactly nodeId link nodeId (no continuation)
+      {
+        ALT: () => this.SUBRULE(this.standaloneLinkStatement),
+        GATE: () => {
+          // Look ahead to see if this is a simple nodeId link nodeId pattern
+          // without any continuation (like more links or ampersands)
+          const la1 = this.LA(1); // First token (should be nodeId)
+          const la2 = this.LA(2); // Second token (should be link)
+          const la3 = this.LA(3); // Third token (should be nodeId)
+          const la4 = this.LA(4); // Fourth token (should be separator or EOF)
+
+          // Check if we have the exact pattern: nodeId link nodeId separator/EOF
+          return (
+            (la1.tokenType === tokens.NODE_STRING || la1.tokenType === tokens.NumberToken) &&
+            (la2.tokenType === tokens.LINK ||
+              la2.tokenType === tokens.THICK_LINK ||
+              la2.tokenType === tokens.DOTTED_LINK ||
+              la2.tokenType === tokens.START_LINK ||
+              la2.tokenType === tokens.START_THICK_LINK ||
+              la2.tokenType === tokens.START_DOTTED_LINK) &&
+            (la3.tokenType === tokens.NODE_STRING || la3.tokenType === tokens.NumberToken) &&
+            (la4 === undefined ||
+              la4.tokenType === tokens.Semicolon ||
+              la4.tokenType === tokens.Newline ||
+              la4.tokenType === tokens.WhiteSpace)
+          );
+        },
+      },
       { ALT: () => this.SUBRULE(this.styleStatement) },
       { ALT: () => this.SUBRULE(this.linkStyleStatement) },
       { ALT: () => this.SUBRULE(this.classDefStatement) },
@@ -170,7 +198,7 @@
     // TODO: Add style separator support when implementing styling
   });

-  // Node ID - handles both simple and compound node IDs
+  // Node ID - handles both simple and special character node IDs
   private nodeId = this.RULE('nodeId', () => {
     this.OR([
       { ALT: () => this.CONSUME(tokens.NODE_STRING) },
@@ -287,11 +315,11 @@
     });
   });

-  // Arrow text - PIPE text PIPE
+  // Arrow text - PIPE text PipeEnd
   private arrowText = this.RULE('arrowText', () => {
     this.CONSUME(tokens.Pipe);
     this.SUBRULE(this.text);
-    this.CONSUME2(tokens.Pipe);
+    this.CONSUME(tokens.PipeEnd);
   });

   // Text rule - following JISON pattern
diff --git a/plan.md b/plan.md
new file mode 100644
index 000000000..27d3481b8
--- /dev/null
+++ b/plan.md
@@ -0,0 +1,212 @@
# Chevrotain Parser Implementation Plan

## Current Status: 86% Complete ✅

**Progress**: 174/203 tests passing (86% success rate)

**Major Achievements**:

- ✅ Fixed grammar ambiguity issues
- ✅ Added `standaloneLinkStatement` to the statement rule with proper lookahead
- ✅ Core parser architecture is working
- ✅ Most single-node, vertex, and basic edge tests are passing

## Remaining Issues: 29 Tests (3 Core Problems)

### ✅ PARTIALLY COMPLETED: Phase 3 - Special Characters

**Status**: Largely fixed - removing conflicting punctuation tokens from the lexer's main mode resolved most special character tests
**Impact**: +2 tests (174/203 passing); the 2 remaining failures are tracked as issue 3 below

### 1. Node Creation in Edges (17 tests) - HIGH PRIORITY

**Problem**: `Cannot read properties of undefined (reading 'id')`
**Root Cause**: When parsing edges like `A-->B`, vertices A and B are not being created in the vertices map

**Examples of Failing Tests**:

- `should handle basic arrow` (`A-->B`)
- `should handle multiple edges` (`A-->B; B-->C`)
- `should handle chained edges` (`A-->B-->C`)

**Solution Strategy**:

1. **Investigate which grammar rule is actually being used** for the failing tests
2. **Add vertex creation to all edge processing paths**:
   - `standaloneLinkStatement` visitor (already has `ensureVertex()`)
   - `vertexStatement` with link chains
   - Any other edge processing methods
3. **Test the fix incrementally**, one failing test at a time

**Implementation Steps**:

```typescript
// In flowAst.ts - ensure all edge processing creates vertices
private ensureVertex(nodeId: string): void {
  if (!this.vertices[nodeId]) {
    this.vertices[nodeId] = {
      id: nodeId,
      text: nodeId,
      type: 'default',
    };
  }
}

// Add to ALL methods that process edges:
// - standaloneLinkStatement ✅ (already done)
// - vertexStatement (when it has link chains)
// - linkChain processing
// - Any other edge creation paths
```

### 2. Arrow Text Parsing (10 tests) - MEDIUM PRIORITY

**Problem**: `Parse error: Expecting token of type --> EOF <-- but found --> '|' <--`
**Root Cause**: The lexer does not properly handle the pipe character `|` in arrow text patterns like `A-->|text|B`

**Examples of Failing Tests**:

- `should handle arrow with text` (`A-->|text|B`)
- `should handle edges with quoted text` (`A-->|"quoted text"|B`)

**Solution Strategy**:

1. **Fix lexer mode switching** for pipe characters
2. **Follow the original JISON grammar** for arrow text patterns
3. **Implement proper tokenization** of `LINK + PIPE + text + PIPE` sequences

**Implementation Steps** (a runnable sketch follows this block):

```typescript
// In flowLexer.ts - fix pipe character handling
// Current issue: the PIPE token conflicts with text content
// Solution: use lexer modes or proper token precedence

// 1. Check how JISON handles |text| patterns
// 2. Implement similar tokenization in Chevrotain
// 3. Ensure link text is properly captured and processed
```
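The `PipeEnd` token consumed by the updated `arrowText` rule in the parser diff above suggests the two-token approach. Below is a minimal, self-contained sketch of that idea using Chevrotain's multi-mode lexer; all token names and patterns here are simplified stand-ins for illustration, not the real `flowLexer` definitions:

```typescript
import { createToken, Lexer } from 'chevrotain';

// Opening '|' pushes an edge-text mode where everything except '|' is text;
// the closing '|' is a distinct token type that pops back to the default mode.
const Pipe = createToken({ name: 'Pipe', pattern: /\|/, push_mode: 'edgeText_mode' });
const EdgeTextContent = createToken({ name: 'EdgeTextContent', pattern: /[^|]+/ });
const PipeEnd = createToken({ name: 'PipeEnd', pattern: /\|/, pop_mode: true });

const NodeString = createToken({ name: 'NodeString', pattern: /\w+/ });
const Link = createToken({ name: 'Link', pattern: /-->/ });
const WhiteSpace = createToken({ name: 'WhiteSpace', pattern: /\s+/, group: Lexer.SKIPPED });

const edgeTextLexer = new Lexer({
  modes: {
    default_mode: [WhiteSpace, Link, Pipe, NodeString],
    edgeText_mode: [PipeEnd, EdgeTextContent],
  },
  defaultMode: 'default_mode',
});

// 'A-->|some text|B' tokenizes as:
// NodeString Link Pipe EdgeTextContent PipeEnd NodeString
const result = edgeTextLexer.tokenize('A-->|some text|B');
console.log(result.tokens.map((t) => `${t.tokenType.name}:${t.image}`));
```

Because the closing delimiter gets its own token type (`PipeEnd`), the parser rule can consume `Pipe text PipeEnd` without any ambiguity about where the text ends, which is exactly the shape of the `arrowText` change in the parser diff.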
### 3. Special Characters at Node Start (2 tests) - LOW PRIORITY

**Problem**: Specific characters (`:`, `&`, `,`, `-`) at the start of node IDs are not being parsed
**Root Cause**: Token precedence issues where punctuation tokens override NODE_STRING

**Examples of Failing Tests**:

- Node IDs starting with `:`, `&`, `,`, `-`

**Solution Strategy**:

1. **Adjust token precedence** in the lexer
2. **Modify the NODE_STRING pattern** to handle special characters
3. **Test with each special character individually**

## Execution Plan

### Phase 1: Fix Node Creation (Target: +17 tests = 191/203 passing)

**Timeline**: 1-2 hours
**Priority**: HIGH - this affects the most tests

1. **Debug which grammar rule is being used** for the failing edge tests

   ```bash
   # Add logging to AST visitor methods to see which path is taken
   vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
   ```

2. **Add vertex creation to all edge processing paths**
   - Check `vertexStatement` when it processes link chains
   - Check `linkChain` processing
   - Ensure `ensureVertex()` is called for all edge endpoints

3. **Test incrementally**

   ```bash
   # Test one failing test at a time
   vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
   ```

### Phase 2: Fix Arrow Text Parsing (Target: +10 tests = 201/203 passing)

**Timeline**: 2-3 hours
**Priority**: MEDIUM - complex lexer issue

1. **Analyze the original JISON grammar** for arrow text patterns

   ```bash
   # Check how flow.jison handles |text| patterns
   grep -n "EdgeText\|PIPE" packages/mermaid/src/diagrams/flowchart/parser/flow.jison
   ```

2. **Fix lexer tokenization** for pipe characters
   - Implement proper mode switching or token precedence
   - Ensure `A-->|text|B` tokenizes as `NODE_STRING LINK PIPE TEXT PIPE NODE_STRING`

3. **Update grammar rules** to handle arrow text
   - Ensure link rules can consume pipe-delimited text
   - Test with various text patterns (quoted, unquoted, complex)

### Phase 3: Fix Special Characters (Target: +2 tests = 203/203 passing)

**Timeline**: 1 hour
**Priority**: LOW - affects the fewest tests

A runnable sketch of this token-ordering approach follows this section.

1. **Identify token conflicts** for each special character
2. **Adjust lexer token order** or patterns
3. **Test each character individually**
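As referenced above, here is a sketch of the Phase 3 approach. It mirrors the `longer_alt: NODE_STRING` technique the lexer diff already applies to `DirectionValue` and the link tokens, but with deliberately simplified patterns (these are not the real `flowLexer` token definitions):

```typescript
import { createToken, Lexer } from 'chevrotain';

// Simplified stand-in for NODE_STRING that permits ':', ',', '&', '-' anywhere.
const NodeString = createToken({ name: 'NODE_STRING', pattern: /[\w:,&-]+/ });

// One-character keyword tokens would normally shadow IDs that start with the
// same character; longer_alt lets the lexer prefer a longer NODE_STRING match.
const DirectionValue = createToken({
  name: 'DirectionValue',
  pattern: /LR|RL|TB|BT|TD|BR|<|>|\^|v/,
  longer_alt: NodeString,
});
const Minus = createToken({ name: 'Minus', pattern: /-/, longer_alt: NodeString });
const WhiteSpace = createToken({ name: 'WhiteSpace', pattern: /\s+/, group: Lexer.SKIPPED });

// Order matters: earlier tokens win ties, while longer_alt rescues the cases
// where a longer NODE_STRING match should beat a one-character keyword.
const lexer = new Lexer([WhiteSpace, DirectionValue, Minus, NodeString]);

for (const input of ['v', 'vnode', '-node', ':id']) {
  const { tokens } = lexer.tokenize(input);
  console.log(input, '->', tokens.map((t) => t.tokenType.name).join(' '));
}
// v -> DirectionValue, vnode -> NODE_STRING, -node -> NODE_STRING, :id -> NODE_STRING
```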
## Success Criteria

### Phase 1 Success:

- [ ] All basic edge tests pass (`A-->B`, `A-->B-->C`, etc.)
- [ ] Vertices are created for all edge endpoints
- [ ] No regression in currently passing tests

### Phase 2 Success:

- [ ] All arrow text tests pass (`A-->|text|B`)
- [ ] The lexer properly tokenizes pipe-delimited text
- [ ] The grammar correctly parses arrow text patterns

### Phase 3 Success:

- [ ] All special character tests pass
- [ ] Node IDs can start with `:`, `&`, `,`, `-`
- [ ] No conflicts with other tokens

### Final Success:

- [ ] **203/203 tests passing (100%)**
- [ ] Full compatibility with the original JISON parser
- [ ] All existing functionality preserved

## Risk Mitigation

### High Risk: Breaking Currently Passing Tests

**Mitigation**: Run the full test suite after each change

```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
```

### Medium Risk: Lexer Changes Affecting Other Patterns

**Mitigation**: Test with diverse input patterns, not just the failing tests

### Low Risk: Performance Impact

**Mitigation**: The current implementation is already efficient; changes should be minimal

## Tools and Commands

### Run Specific Test:

```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
```

### Run All Chevrotain Tests:

```bash
vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
```

### Debug Lexer Tokenization:

```typescript
// In flowParserAdapter.ts
const lexResult = FlowChevLexer.tokenize(input);
console.debug('Tokens:', lexResult.tokens.map((t) => [t.image, t.tokenType.name]));
console.debug('Errors:', lexResult.errors);
```

### Check Grammar Rule Usage:

```typescript
// Add logging to AST visitor methods
console.debug('Using standaloneLinkStatement for:', ctx);
```

## Next Actions

1. **Start with Phase 1** - fix node creation (highest impact)
2. **Debug the exact grammar path** being taken by the failing tests
3. **Add vertex creation to all edge processing methods** (see the sketch at the end of this plan)
4. **Test incrementally** to avoid regressions
5. **Move to Phase 2** only after Phase 1 is complete

This systematic approach fixes the most impactful issues first while preserving the 86% of tests that are already passing.
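As a closing illustration of Next Actions step 3, the sketch below shows the intended shape of the fix: whichever visitor method walks a chain like `A-->B-->C` must register every endpoint before pushing edges. Class and method names here are hypothetical; only `ensureVertex` mirrors the helper actually added to `flowAst.ts` in this patch:

```typescript
interface FlowVertex { id: string; text: string; type: string; }
interface FlowEdge { start: string; end: string; type: string; text: string; }

class ChainVisitorSketch {
  vertices: Record<string, FlowVertex> = {};
  edges: FlowEdge[] = [];

  // Same idea as the ensureVertex helper added to flowAst.ts in this patch.
  private ensureVertex(nodeId: string): void {
    if (!this.vertices[nodeId]) {
      this.vertices[nodeId] = { id: nodeId, text: nodeId, type: 'default' };
    }
  }

  // For `A-->B-->C`: nodeIds = ['A', 'B', 'C'] and links holds the two arrows.
  visitChain(nodeIds: string[], links: { type: string; text: string }[]): void {
    // Register every endpoint first so no edge ever references a missing vertex.
    nodeIds.forEach((id) => this.ensureVertex(id));
    links.forEach((link, i) => {
      this.edges.push({ start: nodeIds[i], end: nodeIds[i + 1], ...link });
    });
  }
}

const visitor = new ChainVisitorSketch();
visitor.visitChain(
  ['A', 'B', 'C'],
  [
    { type: 'arrow_point', text: '' },
    { type: 'arrow_point', text: '' },
  ]
);
console.log(Object.keys(visitor.vertices)); // ['A', 'B', 'C'] - no undefined 'id' reads
```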
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3143931fe..3ee5e27b7 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -74,8 +74,8 @@ importers: specifier: ^8.17.1 version: 8.17.1 chokidar: - specifier: ^4.0.3 - version: 4.0.3 + specifier: 3.6.0 + version: 3.6.0 concurrently: specifier: ^9.1.2 version: 9.1.2 @@ -330,8 +330,8 @@ importers: specifier: ^8.17.1 version: 8.17.1 chokidar: - specifier: ^4.0.3 - version: 4.0.3 + specifier: 3.6.0 + version: 3.6.0 concurrently: specifier: ^9.1.2 version: 9.1.2 @@ -4558,10 +4558,6 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} - chokidar@4.0.3: - resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} - engines: {node: '>= 14.16.0'} - chrome-trace-event@1.0.4: resolution: {integrity: sha512-rNjApaLzuwaOTjCiT8lSDdGN1APCiqkChLMJxJPWLunPAt5fy8xgU9/jNOchV84wfIxrA0lRQB7oCT8jrn/wrQ==} engines: {node: '>=6.0'} @@ -8430,10 +8426,6 @@ packages: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} - readdirp@4.1.2: - resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==} - engines: {node: '>= 14.18.0'} - real-require@0.2.0: resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==} engines: {node: '>= 12.13.0'} @@ -15468,10 +15460,6 @@ snapshots: optionalDependencies: fsevents: 2.3.3 - chokidar@4.0.3: - dependencies: - readdirp: 4.1.2 - chrome-trace-event@1.0.4: {} ci-info@3.9.0: {} @@ -20193,8 +20181,6 @@ snapshots: dependencies: picomatch: 2.3.1 - readdirp@4.1.2: {} - real-require@0.2.0: {} rechoir@0.6.2: