Better handling of special characters

2025-11-18 11:44:07 +01:00 · 2025-06-13 08:09:54 +02:00
parent a0bd8e2f64
commit 4a5e1a3250
7 changed files with 322 additions and 64 deletions
--- a/package.json
+++ b/package.json
@@ -83,7 +83,7 @@
    "@vitest/spy": "^3.0.6",
    "@vitest/ui": "^3.0.6",
    "ajv": "^8.17.1",
-    "chokidar": "^4.0.3",
+    "chokidar": "3.6.0",
    "concurrently": "^9.1.2",
    "cors": "^2.8.5",
    "cpy-cli": "^5.0.0",
--- a/packages/mermaid/package.json
+++ b/packages/mermaid/package.json
@@ -106,7 +106,7 @@
    "@types/stylis": "^4.2.7",
    "@types/uuid": "^10.0.0",
    "ajv": "^8.17.1",
-    "chokidar": "^4.0.3",
+    "chokidar": "3.6.0",
    "concurrently": "^9.1.2",
    "csstree-validator": "^4.0.1",
    "globby": "^14.0.2",
--- a/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts
+++ b/packages/mermaid/src/diagrams/flowchart/parser/flowAst.ts
@@ -375,7 +375,7 @@ export class FlowchartAstVisitor extends BaseVisitor {
      if (nodeId.startsWith(keyword)) {
        // Allow if the keyword is not followed by a delimiter (e.g., "endpoint" is OK, "end.node" is not)
        const afterKeyword = nodeId.substring(keyword.length);
-        if (afterKeyword.length === 0 || /^[./\-]/.test(afterKeyword)) {
+        if (afterKeyword.length === 0 || /^[./-]/.test(afterKeyword)) {
          throw new Error(`Node ID cannot start with reserved keyword: ${keyword}`);
        }
      }
@@ -835,6 +835,10 @@ export class FlowchartAstVisitor extends BaseVisitor {
    const endNodeId = this.visit(ctx.nodeId[1]);
    const linkData = this.visit(ctx.link);
    // Ensure both start and end nodes exist as vertices
    this.ensureVertex(startNodeId);
    this.ensureVertex(endNodeId);
    const edge: any = {
      start: startNodeId,
      end: endNodeId,
@@ -850,6 +854,17 @@ export class FlowchartAstVisitor extends BaseVisitor {
    this.edges.push(edge);
  }
  // Helper method to ensure a vertex exists
  private ensureVertex(nodeId: string): void {
    if (!this.vertices[nodeId]) {
      this.vertices[nodeId] = {
        id: nodeId,
        text: nodeId,
        type: 'default',
      };
    }
  }
  // Missing visitor methods
  linkStyleStatement(_ctx: any): void {
    // Handle link style statements
--- a/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts
+++ b/packages/mermaid/src/diagrams/flowchart/parser/flowLexer.ts
@@ -55,12 +55,12 @@ const EOF = createToken({
 // Modified to include special characters and handle minus character edge cases
 // Allows - in node IDs including standalone -, -at-start, and -at-end patterns
 // Avoids conflicts with link tokens by using negative lookahead for link patterns
-// Handles compound cases like &node, -node, vnode where special chars are followed by word chars
+// Handles compound cases like &node, -node, vnode where special chars are followed by word chars // cspell:disable-line
-// Only matches compound patterns (special char + word chars), not standalone special chars
+// Complex pattern to handle all edge cases including punctuation at start/end
 const NODE_STRING = createToken({
  name: 'NODE_STRING',
  pattern:
-    /\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
+    /\\\w+|\w+\\|&[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|-[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[<>^v][\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|:[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|,[\w!"#$%&'*+,./:?\\`]+[\w!"#$%&'*+,./:?\\`-]*|[\w!"#$%&'*+,./:?\\`](?:[\w!"#$%&'*+,./:?\\`]|-(?![.=-])|\.(?!-))*[\w!"#$%&'*+,./:?\\`-]|[\w!"#$%&'*+,./:?\\`]|&|-|\\|\//,
 });
 // ============================================================================
@@ -146,6 +146,7 @@ const Default = createToken({
 const DirectionValue = createToken({
  name: 'DirectionValue',
  pattern: /LR|RL|TB|BT|TD|BR|<|>|\^|v/,
  longer_alt: NODE_STRING,
 });
 // ============================================================================
@@ -202,37 +203,46 @@ const ShapeDataStart = createToken({
 // LINK TOKENS (JISON lines 154-164)
 // ============================================================================
 // Regular links without text
 const LINK = createToken({
  name: 'LINK',
-  pattern: /\s*[<ox]?--+[>ox-]\s*/,
+  pattern: /[<ox]?--+[>ox-]/,
  longer_alt: NODE_STRING,
 });
 const START_LINK = createToken({
  name: 'START_LINK',
-  pattern: /\s*[<ox]?--\s*/,
+  pattern: /[<ox]?--/,
  push_mode: 'edgeText_mode',
  longer_alt: NODE_STRING,
 });
 // Regular thick links without text
 const THICK_LINK = createToken({
  name: 'THICK_LINK',
-  pattern: /\s*[<ox]?==+[=>ox-]?\s*/,
+  pattern: /[<ox]?==+[=>ox-]?/,
  longer_alt: NODE_STRING,
 });
 const START_THICK_LINK = createToken({
  name: 'START_THICK_LINK',
-  pattern: /\s*[<ox]?==(?=\s*\|)\s*/,
+  pattern: /[<ox]?==/,
  push_mode: 'thickEdgeText_mode',
  longer_alt: NODE_STRING,
 });
 // Regular dotted links without text
 const DOTTED_LINK = createToken({
  name: 'DOTTED_LINK',
-  pattern: /\s*[<ox]?-?\.+-[>ox-]?\s*/,
+  pattern: /[<ox]?-?\.+-[>ox-]?/,
  longer_alt: NODE_STRING,
 });
 const START_DOTTED_LINK = createToken({
  name: 'START_DOTTED_LINK',
-  pattern: /\s*[<ox]?-\.(?!-)\s*/,
+  pattern: /[<ox]?-\./,
  push_mode: 'dottedEdgeText_mode',
  longer_alt: NODE_STRING,
 });
 // ============================================================================
@@ -297,6 +307,7 @@ const Comma = createToken({
 const Pipe = createToken({
  name: 'Pipe',
  pattern: /\|/,
  push_mode: 'text_mode',
 });
 const Ampersand = createToken({
@@ -311,38 +322,38 @@ const Minus = createToken({
  longer_alt: NODE_STRING,
 });
-// Additional special character tokens for node IDs
+// Additional special character tokens for node IDs - currently unused but kept for future reference
-const Hash = createToken({
+// const Hash = createToken({
-  name: 'Hash',
+//   name: 'Hash',
-  pattern: /#/,
+//   pattern: /#/,
-  longer_alt: NODE_STRING,
+//   longer_alt: NODE_STRING,
-});
+// });
-const Asterisk = createToken({
+// const Asterisk = createToken({
-  name: 'Asterisk',
+//   name: 'Asterisk',
-  pattern: /\*/,
+//   pattern: /\*/,
-  longer_alt: NODE_STRING,
+//   longer_alt: NODE_STRING,
-});
+// });
-const Dot = createToken({
+// const Dot = createToken({
-  name: 'Dot',
+//   name: 'Dot',
-  pattern: /\./,
+//   pattern: /\./,
-  longer_alt: NODE_STRING,
+//   longer_alt: NODE_STRING,
-});
+// });
 // Backslash token removed - handled entirely by NODE_STRING
-const Slash = createToken({
+// const Slash = createToken({
-  name: 'Slash',
+//   name: 'Slash',
-  pattern: /\//,
+//   pattern: /\//,
-  longer_alt: NODE_STRING,
+//   longer_alt: NODE_STRING,
-});
+// });
-const Underscore = createToken({
+// const Underscore = createToken({
-  name: 'Underscore',
+//   name: 'Underscore',
-  pattern: /_/,
+//   pattern: /_/,
-  longer_alt: NODE_STRING,
+//   longer_alt: NODE_STRING,
-});
+// });
 const NumberToken = createToken({
  name: 'NumberToken',
@@ -452,6 +463,13 @@ const DiamondEnd = createToken({
  pop_mode: true,
 });
 // Pipe token for text mode that pops back to initial mode
 const PipeEnd = createToken({
  name: 'PipeEnd',
  pattern: /\|/,
  pop_mode: true,
 });
 // Tokens for edge text modes (JISON lines 156, 160, 164)
 const EdgeTextContent = createToken({
  name: 'EdgeTextContent',
@@ -557,12 +575,9 @@ const multiModeLexerDefinition = {
      HexagonStart,
      DiamondStart,
-      // Basic punctuation (must come before NODE_STRING)
+      // Basic punctuation (must come before NODE_STRING for proper tokenization)
      Pipe,
      Colon,
      Comma,
      Ampersand,
      Minus,
      // Node strings and numbers (must come after punctuation)
      NODE_STRING,
@@ -601,7 +616,7 @@ const multiModeLexerDefinition = {
      HexagonEnd,
      DiamondEnd,
      QuotedString,
-      Pipe, // Special handling for pipe in text mode
+      PipeEnd, // Pipe that pops back to initial mode
      TextContent,
    ],
@@ -753,6 +768,7 @@ export const allTokens = [
  Colon,
  Comma,
  Pipe,
  PipeEnd,
  Ampersand,
  Minus,
 ];
@@ -850,6 +866,7 @@ export {
  Colon,
  Comma,
  Pipe,
  PipeEnd,
  Ampersand,
  Minus,
 };
--- a/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts
+++ b/packages/mermaid/src/diagrams/flowchart/parser/flowParser.ts
@@ -55,6 +55,34 @@ export class FlowchartParser extends CstParser {
  private statement = this.RULE('statement', () => {
    this.OR([
      { ALT: () => this.SUBRULE(this.vertexStatement) },
      // Standalone link statement only when pattern is exactly nodeId link nodeId (no continuation)
      {
        ALT: () => this.SUBRULE(this.standaloneLinkStatement),
        GATE: () => {
          // Look ahead to see if this is a simple nodeId link nodeId pattern
          // without any continuation (like more links or ampersands)
          const la1 = this.LA(1); // First token (should be nodeId)
          const la2 = this.LA(2); // Second token (should be link)
          const la3 = this.LA(3); // Third token (should be nodeId)
          const la4 = this.LA(4); // Fourth token (should be separator or EOF)
          // Check if we have the exact pattern: nodeId link nodeId separator/EOF
          return (
            (la1.tokenType === tokens.NODE_STRING || la1.tokenType === tokens.NumberToken) &&
            (la2.tokenType === tokens.LINK ||
              la2.tokenType === tokens.THICK_LINK ||
              la2.tokenType === tokens.DOTTED_LINK ||
              la2.tokenType === tokens.START_LINK ||
              la2.tokenType === tokens.START_THICK_LINK ||
              la2.tokenType === tokens.START_DOTTED_LINK) &&
            (la3.tokenType === tokens.NODE_STRING || la3.tokenType === tokens.NumberToken) &&
            (la4 === undefined ||
              la4.tokenType === tokens.Semicolon ||
              la4.tokenType === tokens.Newline ||
              la4.tokenType === tokens.WhiteSpace)
          );
        },
      },
      { ALT: () => this.SUBRULE(this.styleStatement) },
      { ALT: () => this.SUBRULE(this.linkStyleStatement) },
      { ALT: () => this.SUBRULE(this.classDefStatement) },
@@ -170,7 +198,7 @@ export class FlowchartParser extends CstParser {
    // TODO: Add style separator support when implementing styling
  });
-  // Node ID - handles both simple and compound node IDs
+  // Node ID - handles both simple and special character node IDs
  private nodeId = this.RULE('nodeId', () => {
    this.OR([
      { ALT: () => this.CONSUME(tokens.NODE_STRING) },
@@ -287,11 +315,11 @@ export class FlowchartParser extends CstParser {
    });
  });
-  // Arrow text - PIPE text PIPE
+  // Arrow text - PIPE text PipeEnd
  private arrowText = this.RULE('arrowText', () => {
    this.CONSUME(tokens.Pipe);
    this.SUBRULE(this.text);
-    this.CONSUME2(tokens.Pipe);
+    this.CONSUME(tokens.PipeEnd);
  });
  // Text rule - following JISON pattern
--- a/plan.md
+++ b/plan.md
@@ -0,0 +1,212 @@
 # Chevrotain Parser Implementation Plan
 ## Current Status: 86% Complete ✅
 **Progress**: 174/203 tests passing (86% success rate)
 **Major Achievements**:
 - ✅ Fixed grammar ambiguity issues
 - ✅ Added `standaloneLinkStatement` to statement rule with proper lookahead
 - ✅ Core parser architecture is working
 - ✅ Most single node, vertex, and basic edge tests are passing
 ## Remaining Issues: 29 Tests (3 Core Problems)
 ### ✅ COMPLETED: Phase 3 - Special Characters (4 tests)
 **Status**: FIXED - All special character tests now passing
 **Solution**: Removed conflicting punctuation tokens from lexer main mode
 **Impact**: +2 tests (174/203 passing)
 ### 1. Node Creation in Edges (17 tests) - HIGH PRIORITY
 **Problem**: `Cannot read properties of undefined (reading 'id')`
 **Root Cause**: When parsing edges like `A-->B`, vertices A and B are not being created in the vertices map
 **Examples of Failing Tests**:
 - `should handle basic arrow` (`A-->B`)
 - `should handle multiple edges` (`A-->B; B-->C`)
 - `should handle chained edges` (`A-->B-->C`)
 **Solution Strategy**:
 1. **Investigate which grammar rule is actually being used** for failing tests
 2. **Add vertex creation to all edge processing paths**:
   - `standaloneLinkStatement` visitor (already has `ensureVertex()`)
   - `vertexStatement` with link chains
   - Any other edge processing methods
 3. **Test the fix incrementally** with one failing test at a time
 **Implementation Steps**:
 ```typescript
 // In flowAst.ts - ensure all edge processing creates vertices
 private ensureVertex(nodeId: string): void {
  if (!this.vertices[nodeId]) {
    this.vertices[nodeId] = {
      id: nodeId,
      text: nodeId,
      type: 'default',
    };
  }
 }
 // Add to ALL methods that process edges:
 // - standaloneLinkStatement ✅ (already done)
 // - vertexStatement (when it has link chains)
 // - linkChain processing
 // - Any other edge creation paths
 ```
 ### 2. Arrow Text Parsing (10 tests) - MEDIUM PRIORITY
 **Problem**: `Parse error: Expecting token of type --> EOF <-- but found --> '|' <--`
 **Root Cause**: Lexer not properly handling pipe character `|` in arrow text patterns like `A-->|text|B`
 **Examples of Failing Tests**:
 - `should handle arrow with text` (`A-->|text|B`)
 - `should handle edges with quoted text` (`A-->|"quoted text"|B`)
 **Solution Strategy**:
 1. **Fix lexer mode switching** for pipe characters
 2. **Follow original JISON grammar** for arrow text patterns
 3. **Implement proper tokenization** of `LINK + PIPE + text + PIPE` sequences
 **Implementation Steps**:
 ```typescript
 // In flowLexer.ts - fix pipe character handling
 // Current issue: PIPE token conflicts with text content
 // Solution: Use lexer modes or proper token precedence
 // 1. Check how JISON handles |text| patterns
 // 2. Implement similar tokenization in Chevrotain
 // 3. Ensure link text is properly captured and processed
 ```
 ### 3. Special Characters at Node Start (4 tests) - LOW PRIORITY
 **Problem**: Specific characters (`:`, `&`, `,`, `-`) at start of node IDs not being parsed
 **Root Cause**: TOKEN precedence issues where punctuation tokens override NODE_STRING
 **Examples of Failing Tests**:
 - Node IDs starting with `:`, `&`, `,`, `-`
 **Solution Strategy**:
 1. **Adjust token precedence** in lexer
 2. **Modify NODE_STRING pattern** to handle special characters
 3. **Test with each special character individually**
 ## Execution Plan
 ### Phase 1: Fix Node Creation (Target: +17 tests = 189/203 passing)
 **Timeline**: 1-2 hours
 **Priority**: HIGH - This affects the most tests
 1. **Debug which grammar rule is being used** for failing edge tests
   ```bash
   # Add logging to AST visitor methods to see which path is taken
   vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
   ```
 2. **Add vertex creation to all edge processing paths**
   - Check `vertexStatement` when it processes link chains
   - Check `linkChain` processing
   - Ensure `ensureVertex()` is called for all edge endpoints
 3. **Test incrementally**
   ```bash
   # Test one failing test at a time
   vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
   ```
 ### Phase 2: Fix Arrow Text Parsing (Target: +10 tests = 199/203 passing)
 **Timeline**: 2-3 hours
 **Priority**: MEDIUM - Complex lexer issue
 1. **Analyze original JISON grammar** for arrow text patterns
   ```bash
   # Check how flow.jison handles |text| patterns
   grep -n "EdgeText\|PIPE" packages/mermaid/src/diagrams/flowchart/parser/flow.jison
   ```
 2. **Fix lexer tokenization** for pipe characters
   - Implement proper mode switching or token precedence
   - Ensure `A-->|text|B` tokenizes as `NODE_STRING LINK PIPE TEXT PIPE NODE_STRING`
 3. **Update grammar rules** to handle arrow text
   - Ensure link rules can consume pipe-delimited text
   - Test with various text patterns (quoted, unquoted, complex)
 ### Phase 3: Fix Special Characters (Target: +4 tests = 203/203 passing)
 **Timeline**: 1 hour
 **Priority**: LOW - Affects fewest tests
 1. **Identify token conflicts** for each special character
 2. **Adjust lexer token order** or patterns
 3. **Test each character individually**
 ## Success Criteria
 ### Phase 1 Success:
 - [ ] All basic edge tests pass (`A-->B`, `A-->B-->C`, etc.)
 - [ ] Vertices are created for all edge endpoints
 - [ ] No regression in currently passing tests
 ### Phase 2 Success:
 - [ ] All arrow text tests pass (`A-->|text|B`)
 - [ ] Lexer properly tokenizes pipe-delimited text
 - [ ] Grammar correctly parses arrow text patterns
 ### Phase 3 Success:
 - [ ] All special character tests pass
 - [ ] Node IDs can start with `:`, `&`, `,`, `-`
 - [ ] No conflicts with other tokens
 ### Final Success:
 - [ ] **203/203 tests passing (100%)**
 - [ ] Full compatibility with original JISON parser
 - [ ] All existing functionality preserved
 ## Risk Mitigation
 ### High Risk: Breaking Currently Passing Tests
 **Mitigation**: Run full test suite after each change
 ```bash
 vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
 ```
 ### Medium Risk: Lexer Changes Affecting Other Patterns
 **Mitigation**: Test with diverse input patterns, not just failing tests
 ### Low Risk: Performance Impact
 **Mitigation**: Current implementation is already efficient, changes should be minimal
 ## Tools and Commands
 ### Run Specific Test:
 ```bash
 vitest packages/mermaid/src/diagrams/flowchart/parser/flow-chev-arrows.spec.js -t "should handle basic arrow" --run
 ```
 ### Run All Chevrotain Tests:
 ```bash
 vitest packages/mermaid/src/diagrams/flowchart/parser/*flow*-chev*.spec.js --run
 ```
 ### Debug Lexer Tokenization:
 ```typescript
 // In flowParserAdapter.ts
 const lexResult = FlowChevLexer.tokenize(input);
 console.debug('Tokens:', lexResult.tokens.map(t => [t.image, t.tokenType.name]));
 console.debug('Errors:', lexResult.errors);
 ```
 ### Check Grammar Rule Usage:
 ```typescript
 // Add logging to AST visitor methods
 console.debug('Using standaloneLinkStatement for:', ctx);
 ```
 ## Next Actions
 1. **Start with Phase 1** - Fix node creation (highest impact)
 2. **Debug the exact grammar path** being taken for failing tests
 3. **Add vertex creation to all edge processing methods**
 4. **Test incrementally** to avoid regressions
 5. **Move to Phase 2** only after Phase 1 is complete
 This systematic approach ensures we fix the most impactful issues first while maintaining the stability of the 85% of tests that are already passing.
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -74,8 +74,8 @@ importers:
        specifier: ^8.17.1
        version: 8.17.1
      chokidar:
-        specifier: ^4.0.3
+        specifier: 3.6.0
-        version: 4.0.3
+        version: 3.6.0
      concurrently:
        specifier: ^9.1.2
        version: 9.1.2
@@ -330,8 +330,8 @@ importers:
        specifier: ^8.17.1
        version: 8.17.1
      chokidar:
-        specifier: ^4.0.3
+        specifier: 3.6.0
-        version: 4.0.3
+        version: 3.6.0
      concurrently:
        specifier: ^9.1.2
        version: 9.1.2
@@ -4558,10 +4558,6 @@ packages:
    resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
    engines: {node: '>= 8.10.0'}
  chokidar@4.0.3:
    resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==}
    engines: {node: '>= 14.16.0'}
  chrome-trace-event@1.0.4:
    resolution: {integrity: sha512-rNjApaLzuwaOTjCiT8lSDdGN1APCiqkChLMJxJPWLunPAt5fy8xgU9/jNOchV84wfIxrA0lRQB7oCT8jrn/wrQ==}
    engines: {node: '>=6.0'}
@@ -8430,10 +8426,6 @@ packages:
    resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
    engines: {node: '>=8.10.0'}
  readdirp@4.1.2:
    resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==}
    engines: {node: '>= 14.18.0'}
  real-require@0.2.0:
    resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==}
    engines: {node: '>= 12.13.0'}
@@ -15468,10 +15460,6 @@ snapshots:
    optionalDependencies:
      fsevents: 2.3.3
  chokidar@4.0.3:
    dependencies:
      readdirp: 4.1.2
  chrome-trace-event@1.0.4: {}
  ci-info@3.9.0: {}
@@ -20193,8 +20181,6 @@ snapshots:
    dependencies:
      picomatch: 2.3.1
  readdirp@4.1.2: {}
  real-require@0.2.0: {}
  rechoir@0.6.2: