-
Notifications
You must be signed in to change notification settings - Fork 414
Strip zero-width or unprintable unicode characters #9564
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5ffe176
92f61a7
dc577ec
c7d88e3
a6aa836
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| #!/usr/bin/env bash | ||
| # | ||
| # strip_unprintable.bash - Remove invisible / unprintable characters from text files. | ||
|
|
||
| set -euo pipefail | ||
|
|
||
| PROG=$(basename "$0") | ||
|
|
||
| # Single source of truth for the characters this script strips. Entries come in | ||
| # pairs: a fragment of a Perl bracketed character class, followed by a description | ||
| # for humans. ranges() prints one "<fragment><TAB><description>" row per pair. | ||
| # Both the Perl class and the --help listing are built from that output, so a | ||
| # range is added or removed by editing this list alone. | ||
| ranges() { | ||
|   local -a table=( | ||
|     '\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}\x{007F}' 'C0 controls / DEL (TAB, LF, CR preserved)' | ||
|     '\x{0080}-\x{009F}' 'C1 controls' | ||
|     '\x{00A0}' 'no-break space' | ||
|     '\x{200B}-\x{200F}' 'zero-width space/joiners, bidi marks' | ||
|     '\x{202A}-\x{202E}' 'bidi embedding/override' | ||
|     '\x{2060}-\x{2064}' 'word joiner, invisible operators' | ||
|     '\x{FEFF}' 'BOM / zero-width no-break space' | ||
|   ) | ||
|   local i | ||
|   # Walk the list two entries at a time: even index = class fragment, odd = description. | ||
|   for ((i = 0; i < ${#table[@]}; i += 2)); do | ||
|     printf '%s\t%s\n' "${table[i]}" "${table[i + 1]}" | ||
|   done | ||
| } | ||
|
|
||
| # Interpreter used for all processing; callers may override it through the | ||
| # PERL environment variable. An unset or empty value falls back to "perl". | ||
| : "${PERL:=perl}" | ||
| # Bracketed Perl character class: column 1 of every ranges() row, concatenated | ||
| # with the newlines removed and wrapped in [...]. | ||
| BAD_CLASS="[$(ranges | cut -f1 | tr -d '\n')]" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it safe to assume that all developer systems have `perl`?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is mostly if you have multiple perls installed (e.g. one from system and one from package manager). I seriously doubt this would be needed often, but I always like to provide such escape hatches for the "main" driver utils of scripts so that users can just do e.g. `PYTHON=/my/python AWK=/my/awk PERL=/my/perl script.bash`
||
|
|
||
| # usage - print the help text to stdout. | ||
| # | ||
| # The text is an unquoted here-document, so ${PROG} and the $(ranges | sed ...) | ||
| # command substitution are expanded when the function runs; the "Removed | ||
| # characters" section is therefore generated from the ranges table rather than | ||
| # written by hand. Callers that report an error redirect this output to stderr. | ||
| usage() { | ||
| cat <<EOF | ||
| Usage: ${PROG} [--check] FILE [FILE...] | ||
|
|
||
| Remove invisible / unprintable characters from text files, in place, while | ||
| preserving ordinary whitespace (TAB U+0009, LF U+000A, CR U+000D). | ||
|
|
||
| Options: | ||
| --check Report offending files and their line/column locations; make no | ||
| edits. Exits non-zero if any unprintable characters are found. | ||
| -h, --help Show this help and exit. | ||
|
|
||
| Removed characters: | ||
| $(ranges | sed 's/^/ /') | ||
| EOF | ||
| } | ||
|
|
||
| check_only=0 | ||
|
|
||
| # Consume leading options. The first argument that is not an option, or an | ||
| # explicit "--", ends option parsing; whatever remains in "$@" is the file list. | ||
| while (($# > 0)); do | ||
|   if [[ ${1} == "-h" || ${1} == "--help" ]]; then | ||
|     usage | ||
|     exit 0 | ||
|   elif [[ ${1} == "--check" ]]; then | ||
|     check_only=1 | ||
|     shift | ||
|   elif [[ ${1} == "--" ]]; then | ||
|     # Drop the terminator itself; everything after it is a file name. | ||
|     shift | ||
|     break | ||
|   elif [[ ${1} == -* ]]; then | ||
|     echo "error: unknown option: ${1}" >&2 | ||
|     usage >&2 | ||
|     exit 2 | ||
|   else | ||
|     # First file operand: leave it in "$@" and stop parsing. | ||
|     break | ||
|   fi | ||
| done | ||
|
|
||
| # At least one FILE operand must remain after option parsing; otherwise report | ||
| # the problem and the usage text on stderr and exit with status 2. | ||
| (($# > 0)) || { | ||
|   echo "error: no files given" >&2 | ||
|   usage >&2 | ||
|   exit 2 | ||
| } | ||
|
|
||
| status=0 | ||
| # Both modes process every file in a single perl process to avoid per-file interpreter | ||
| # startup overhead. In --check mode, `close ARGV if eof` resets the line counter ($.) at | ||
| # each file boundary so reported line numbers are per-file. | ||
| # | ||
| # --check mode prints one "FILE:LINE:COLUMN: U+XXXX" row per offending character | ||
| # (COLUMN is the 1-based position of the match within the line) and exits 1 if | ||
| # anything was reported, 0 otherwise. No file is modified. | ||
| if [[ ${check_only} -eq 1 ]]; then | ||
|   # $cls/$ARGV/$./$& are Perl variables and must stay literal inside the single quotes; | ||
|   # only BAD_CLASS is interpolated, via the deliberate quote break-out. | ||
|   # | ||
|   # The "--" before "$@" ends perl's own switch parsing. Without it, a file name | ||
|   # that starts with "-" (which this script accepts after its own "--") would be | ||
|   # interpreted by perl as a command-line switch instead of being read. | ||
|   # | ||
|   # shellcheck disable=SC2016 | ||
|   ${PERL} -CSD -ne ' | ||
|     BEGIN { $cls = qr/'"${BAD_CLASS}"'/; } | ||
|     while (/$cls/g) { | ||
|       printf "%s:%d:%d: U+%04X\n", $ARGV, $., pos() - length($&) + 1, ord($&); | ||
|       $found = 1; | ||
|     } | ||
|     close ARGV if eof; | ||
|     END { exit($found ? 1 : 0); } | ||
|   ' -- "$@" || status=1 | ||
|   exit "${status}" | ||
| fi | ||
|
Jacobfaib marked this conversation as resolved.
|
||
|
|
||
| # Default mode: delete every character matched by BAD_CLASS from each file, | ||
| # editing in place (-i) with a single perl process for all files. The "--" ends | ||
| # perl's switch parsing so that a file name beginning with "-" is treated as a | ||
| # file rather than as a perl option (e.g. "-i.bak" would otherwise alter -i). | ||
| ${PERL} -CSD -i -pe 's/'"${BAD_CLASS}"'//g' -- "$@" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| Param( | ||
| Param( | ||
| [Parameter(Mandatory = $false)] | ||
| [Alias("std")] | ||
| [ValidateNotNullOrEmpty()] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| Param( | ||
| Param( | ||
| [Parameter(ValueFromRemainingArguments = $true)] | ||
| [string[]]$PassthroughArgs | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| Param( | ||
| Param( | ||
| [Parameter(ValueFromRemainingArguments = $true)] | ||
| [string[]]$PassthroughArgs | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| Param( | ||
| Param( | ||
| [Parameter(ValueFromRemainingArguments = $true)] | ||
| [string[]]$PassthroughArgs | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| Param( | ||
| Param( | ||
| [Parameter(ValueFromRemainingArguments = $true)] | ||
| [string[]]$PassthroughArgs | ||
| ) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.