From bc4e860af5ff2582ce17b1a68e8ae88f85f88b9a Mon Sep 17 00:00:00 2001 From: Nick Gasson Date: Sun, 19 Mar 2023 14:56:47 +0000 Subject: [PATCH] Warn on possible multi-byte UTF-8 characters in input --- src/lexer.l | 18 ++++++++++++++++++ test/charset/utf8.vhd | 4 ++++ test/test_charset.c | 22 ++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 test/charset/utf8.vhd diff --git a/src/lexer.l b/src/lexer.l index 48cb0491..3d8dd4d5 100644 --- a/src/lexer.l +++ b/src/lexer.l @@ -78,6 +78,7 @@ static int parse_decimal_literal(const char *str); static int parse_based_literal(const char *str); static int resolve_ir1045(void); static void warn_lrm(vhdl_standard_t std, const char *fmt, ...); +static void warn_utf8(const char *str); static bool begin_psl_comment(void); static int last_token = -1; @@ -112,6 +113,7 @@ COVERAGE_OFF {PRAGMA}(?i:coverage)[ \t]+(?i:off).* COVERAGE_ON {PRAGMA}(?i:coverage)[ \t]+(?i:on).* PSL_COMMENT {PRAGMA}(?i:psl)[ \t]+ PSL_CONT ^{SPACE}*({PSL_COMMENT}|"--") +UTF8_MB [\x80-\xff][\x80-\xbf]{1,3} %x COMMENT C_COMMENT PSL VLOG @@ -492,6 +494,8 @@ COVER ?i:cover {SEVERITY} { TOKEN(tSEVERITY); } {REPORT} { TOKEN(tREPORT); } +{UTF8_MB} { warn_utf8(yytext); REJECT; } + {VHDL_ID} { return parse_id(yytext); } {EXID} { return parse_ex_id(yytext); } <*>{SPACE} { } @@ -726,6 +730,20 @@ static void warn_lrm(vhdl_standard_t std, const char *fmt, ...) 
va_end(ap); } +static void warn_utf8(const char *str) +{ /* Emits a DIAG_WARN diagnostic located at yylloc saying the input may contain multi-byte UTF-8; str is not examined. */ + static bool warned = false; /* latch: set once and never cleared, so the warning is emitted at most once */ + + if (!warned) { + diag_t *d = diag_new(DIAG_WARN, &yylloc); + diag_printf(d, "possible multi-byte UTF-8 character found in input"); + diag_hint(d, NULL, "the native encoding of VHDL is ISO-8859-1"); + diag_emit(d); + + warned = true; + } +} + void reset_scanner(void) { YY_FLUSH_BUFFER; diff --git a/test/charset/utf8.vhd b/test/charset/utf8.vhd new file mode 100644 index 00000000..19c6c874 --- /dev/null +++ b/test/charset/utf8.vhd @@ -0,0 +1,4 @@ +-- -*- coding: utf-8 -*- +package test is + constant Åxyzß : bit := '1'; -- Warning +end package; diff --git a/test/test_charset.c b/test/test_charset.c index adabef4b..cd1217c0 100644 --- a/test/test_charset.c +++ b/test/test_charset.c @@ -54,12 +54,34 @@ START_TEST(test_iso88591) } END_TEST +START_TEST(test_utf8) +{ /* Parses charset/utf8.vhd: expects the UTF-8 warning plus a constant-declaration parse error, and still a T_PACKAGE tree from the first parse(). */ + input_from_file(TESTDIR "/charset/utf8.vhd"); + + const error_t expect[] = { + { 3, "possible multi-byte UTF-8 character found in input" }, + { 3, "unexpected error while parsing constant declaration" }, + { -1, NULL } + }; + expect_errors(expect); + + tree_t p = parse(); + fail_if(p == NULL); + fail_unless(tree_kind(p) == T_PACKAGE); + + fail_unless(parse() == NULL); + + check_expected_errors(); +} +END_TEST + Suite *get_charset_tests(void) { Suite *s = suite_create("charset"); TCase *tc = nvc_unit_test(); tcase_add_test(tc, test_iso88591); + tcase_add_test(tc, test_utf8); suite_add_tcase(s, tc); return s; -- 2.39.2