From aafcf380a1232d47f0754e3145a772e2bf698dd7 Mon Sep 17 00:00:00 2001 From: Lucas Mas Roca <82908407+lmasroca@users.noreply.github.com> Date: Thu, 7 Aug 2025 16:33:53 -0300 Subject: [PATCH 1/6] Create test.txt --- test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 0000000000..3d18db5665 --- /dev/null +++ b/test.txt @@ -0,0 +1 @@ +test CI From 58e9f702df278c9bb7d3b411b9c181f2b2a8d5e3 Mon Sep 17 00:00:00 2001 From: Lucas Date: Thu, 14 Aug 2025 20:47:21 +0000 Subject: [PATCH 2/6] hexadecimal escapes --- .../org/evomaster/core/parser/RegexEcma262.g4 | 15 +++++++++------ .../org/evomaster/core/parser/RegexJava.g4 | 15 +++++++++------ .../core/parser/GeneRegexEcma262Visitor.kt | 19 ++++++++++++++++--- .../core/parser/GeneRegexJavaVisitor.kt | 16 ++++++++++++++-- .../parser/GeneRegexEcma262VisitorTest.kt | 5 +++++ test.txt | 1 - 6 files changed, 53 insertions(+), 18 deletions(-) delete mode 100644 test.txt diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 index 0aa6ab1ef8..9ce254370a 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 @@ -96,7 +96,7 @@ atom //TODO -//CharacterEscape +CharacterEscape: HexEscapeSequence; // : ControlEscape // | 'c' ControlLetter // | HexEscapeSequence @@ -205,7 +205,7 @@ AtomEscape : '\\' CharacterClassEscape //TODO // | '\\' DecimalEscape -// | '\\' CharacterEscape + | '\\' CharacterEscape ; fragment CharacterClassEscape @@ -239,10 +239,13 @@ BaseChar ; //TODO -//HexEscapeSequence -// : 'x' HexDigit HexDigit -// ; -// +HexEscapeSequence + : 'x' HexDigit HexDigit + ; + +fragment HexDigit: + [a-fA-F0-9] + ; //TODO //DecimalIntegerLiteral diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 
f9641dbec9..4856c2989e 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -119,7 +119,7 @@ quoteChar ; //TODO -//CharacterEscape +CharacterEscape: HexEscapeSequence; // : ControlEscape // | 'c' ControlLetter // | HexEscapeSequence @@ -230,7 +230,7 @@ AtomEscape : '\\' CharacterClassEscape //TODO // | '\\' DecimalEscape -// | '\\' CharacterEscape + | '\\' CharacterEscape ; fragment CharacterClassEscape @@ -268,10 +268,13 @@ BaseChar ; //TODO -//HexEscapeSequence -// : 'x' HexDigit HexDigit -// ; -// +HexEscapeSequence + : 'x' HexDigit HexDigit + ; + +fragment HexDigit: + [a-fA-F0-9] + ; //TODO //DecimalIntegerLiteral diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index 74cb86749b..a16f911956 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -166,9 +166,22 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor(){ return VisitResult(gene) } - if(ctx.AtomEscape() != null){ - val char = ctx.AtomEscape().text[1].toString() - return VisitResult(CharacterClassEscapeRxGene(char)) + if(ctx.AtomEscape() != null) { + val txt = ctx.AtomEscape().text + when { + txt[1] == 'x' -> { + val hexValue = + txt.subSequence(2, txt.length).toString().toInt(16) + return VisitResult( + PatternCharacterBlockGene( + txt, + hexValue.toChar().toString() + ) + ) + } + + else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString())) + } } if(ctx.disjunction() != null){ diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 2fe1081538..9e9adaf0bb 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ 
b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -179,8 +179,20 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ } if(ctx.AtomEscape() != null){ - val char = ctx.AtomEscape().text[1].toString() - return VisitResult(CharacterClassEscapeRxGene(char)) + val txt = ctx.AtomEscape().text + when { + txt[1] == 'x' -> { + val hexValue = + txt.subSequence(2, txt.length).toString().toInt(16) + return VisitResult( + PatternCharacterBlockGene( + txt, + hexValue.toChar().toString() + ) + ) + } + else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString())) + } } if(ctx.disjunction() != null){ diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt index fbf487d70d..f639307e45 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt @@ -326,4 +326,9 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){ // p = 1 / 2^6 = 1 / 64 checkCanSample("^((a|A)(b|B)(c|C)123(e|E)(f|F)(d|D))$", "aBc123EFd", 10_000) } + + @Test + fun testHexEscape(){ + checkSameAsJava("""\x00\x0a\xba\xFF""") + } } \ No newline at end of file diff --git a/test.txt b/test.txt deleted file mode 100644 index 3d18db5665..0000000000 --- a/test.txt +++ /dev/null @@ -1 +0,0 @@ -test CI From b88f1d21af2d1e7c348163bf9167f18e339b0893 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Tue, 30 Sep 2025 17:56:29 -0300 Subject: [PATCH 3/6] tests for invalid escapes --- .../antlr4/org/evomaster/core/parser/RegexEcma262.g4 | 2 +- .../antlr4/org/evomaster/core/parser/RegexJava.g4 | 2 +- .../evomaster/core/parser/GeneRegexEcma262Visitor.kt | 3 ++- .../org/evomaster/core/parser/GeneRegexJavaVisitor.kt | 3 ++- .../org/evomaster/core/parser/RegexHandlerTest.kt | 11 +++++++++++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git 
a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 index 9ce254370a..cf59da14cc 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 @@ -22,7 +22,7 @@ grammar RegexEcma262; //------ PARSER ------------------------------ // Parser rules have first letter in lower-case -pattern : disjunction; +pattern : disjunction EOF; disjunction diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 4856c2989e..b2ff4b0179 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -23,7 +23,7 @@ grammar RegexJava; //------ PARSER ------------------------------ // Parser rules have first letter in lower-case -pattern : disjunction; +pattern : disjunction EOF; disjunction diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index a16f911956..67dac9dbbb 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -16,7 +16,8 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text") + // we remove the <EOF> token from end of the string to store as sourceRegex + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length-5)}") return VisitResult(gene) } diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 9e9adaf0bb..8e1c4f3836 100644
--- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -16,7 +16,8 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text") + // we remove the <EOF> token from end of the string to store as sourceRegex + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length-5)}") return VisitResult(gene) } diff --git a/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt index 5fd3af1e7d..b77e39c04e 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt @@ -121,4 +121,15 @@ internal class RegexHandlerTest{ } + @Test + fun testCreateGeneForJVMInvalidRegex() { + + assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\xR") } + } + + @Test + fun testCreateGeneForEcma262InvalidRegex() { + + assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForEcma262("\\xR") } + } } \ No newline at end of file From 931a4b0f6627a32db340c027fdb643b307e02668 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Tue, 30 Sep 2025 18:56:00 -0300 Subject: [PATCH 4/6] added support for unicode escapes (e.g.: "\u0000", "\uaaaa", etc.)
--- .../org/evomaster/core/parser/RegexEcma262.g4 | 13 ++++++++----- .../antlr4/org/evomaster/core/parser/RegexJava.g4 | 13 ++++++++----- .../core/parser/GeneRegexEcma262Visitor.kt | 2 +- .../evomaster/core/parser/GeneRegexJavaVisitor.kt | 2 +- .../core/parser/GeneRegexEcma262VisitorTest.kt | 5 +++++ 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 index cf59da14cc..31c720960e 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 @@ -96,13 +96,13 @@ atom //TODO -CharacterEscape: HexEscapeSequence; +CharacterEscape // : ControlEscape // | 'c' ControlLetter -// | HexEscapeSequence -// | UnicodeEscapeSequence + : HexEscapeSequence + | UnicodeEscapeSequence //| IdentityEscape -// ; + ; //TODO //ControlEscape @@ -238,7 +238,10 @@ BaseChar : ~[0-9,^$\\.*+?()[\]{}|-] ; -//TODO +UnicodeEscapeSequence + : 'u' HexDigit HexDigit HexDigit HexDigit + ; + HexEscapeSequence : 'x' HexDigit HexDigit ; diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index b2ff4b0179..47eb5a2296 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -119,13 +119,13 @@ quoteChar ; //TODO -CharacterEscape: HexEscapeSequence; +CharacterEscape // : ControlEscape // | 'c' ControlLetter -// | HexEscapeSequence -// | UnicodeEscapeSequence + : HexEscapeSequence + | UnicodeEscapeSequence //| IdentityEscape -// ; + ; //TODO //ControlEscape @@ -267,7 +267,10 @@ BaseChar : ~[0-9,^$\\.*+?()[\]{}|-] ; -//TODO +UnicodeEscapeSequence: + 'u' HexDigit HexDigit HexDigit HexDigit +; + HexEscapeSequence : 'x' HexDigit HexDigit ; diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt 
b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index 67dac9dbbb..b98e26fa77 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -170,7 +170,7 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor(){ if(ctx.AtomEscape() != null) { val txt = ctx.AtomEscape().text when { - txt[1] == 'x' -> { + txt[1] == 'x' || txt[1] == 'u' -> { val hexValue = txt.subSequence(2, txt.length).toString().toInt(16) return VisitResult( diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 8e1c4f3836..b047d89239 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -182,7 +182,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ if(ctx.AtomEscape() != null){ val txt = ctx.AtomEscape().text when { - txt[1] == 'x' -> { + txt[1] == 'x' || txt[1] == 'u' -> { val hexValue = txt.subSequence(2, txt.length).toString().toInt(16) return VisitResult( diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt index f639307e45..2a906745c9 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt @@ -331,4 +331,9 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){ fun testHexEscape(){ checkSameAsJava("""\x00\x0a\xba\xFF""") } + + @Test + fun testUnicodeEscape(){ + checkSameAsJava("""\u0000\u0a0b\uffff""") + } } \ No newline at end of file From 04ff0311fd0396f027fa2ccef5d2ffa182b7f92e Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 3 Oct 2025 01:57:19 +0000 Subject: [PATCH 5/6] small refactor --- 
.../kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt | 2 +- .../kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index b98e26fa77..79d6cdbd7c 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -17,7 +17,7 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) // we remove the <EOF> token from end of the string to store as sourceRegex - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length-5)}") + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length - "<EOF>".length)}") return VisitResult(gene) } diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index b047d89239..d81b1c255a 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -17,7 +17,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) // we remove the <EOF> token from end of the string to store as sourceRegex - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length-5)}") + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length - "<EOF>".length)}") return VisitResult(gene) } From 2305c164a631f712b84ebb48acd13f47c3988880 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 3 Oct 2025 13:40:13 +0000 Subject: [PATCH 6/6]
small refactor --- .../org/evomaster/core/parser/GeneRegexEcma262Visitor.kt | 3 ++- .../kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index 79d6cdbd7c..3cdb7786cb 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -2,6 +2,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.regex.* +private const val EOF_TOKEN = "<EOF>" /** * Parser Visitor based on the RegexEcma262.g4 grammar file */ @@ -17,7 +18,7 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) // we remove the <EOF> token from end of the string to store as sourceRegex - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length - "<EOF>".length)}") + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length - EOF_TOKEN.length)}") return VisitResult(gene) } diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index d81b1c255a..bd25b11714 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -2,6 +2,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.regex.* +private const val EOF_TOKEN = "<EOF>" /** * Created by arcuri82 on 11-Sep-19.
*/ @@ -17,7 +18,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) // we remove the <EOF> token from end of the string to store as sourceRegex - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length - "<EOF>".length)}") + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length - EOF_TOKEN.length)}") return VisitResult(gene) }