diff options
author | Simon Rettberg | 2020-07-30 17:21:29 +0200 |
---|---|---|
committer | Simon Rettberg | 2020-07-31 12:05:13 +0200 |
commit | 4e31f99e098568c5d133125630533dad91d07348 (patch) | |
tree | d6302d3d526a79f64c0b081581679f928f55da54 /inc | |
parent | [exams] More sanity checks regarding date/time and locations (diff) | |
download | slx-admin-4e31f99e098568c5d133125630533dad91d07348.tar.gz slx-admin-4e31f99e098568c5d133125630533dad91d07348.tar.xz slx-admin-4e31f99e098568c5d133125630533dad91d07348.zip |
[serversetup-bwlp-ipxe/statistics] Sanitize strings from clients
Some strings from the clients might not be well suited for UTF-8
representation. Add wrapper functions that clean utf-8 strings, or
convert ANSI strings to UTF-8 while removing problematic chars.
Diffstat (limited to 'inc')
-rw-r--r-- | inc/util.inc.php | 33 |
1 files changed, 33 insertions, 0 deletions
/**
 * Remove every byte that is not part of a well-formed UTF-8 sequence.
 *
 * Only sequences that are well-formed according to RFC 3629 are kept.
 * Overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF),
 * code points above U+10FFFF, stray continuation bytes and truncated
 * multi-byte sequences are dropped byte by byte, so the result is
 * always valid UTF-8.
 *
 * @param string $string arbitrary byte string, e.g. as received from a client
 * @return string the input with all invalid bytes removed; empty string
 *         if the regex engine reports an error
 */
public static function cleanUtf8(string $string) : string
{
	// Based on https://stackoverflow.com/a/1401716/2043481, tightened to the
	// well-formed byte ranges. NOTE: comments inside the pattern must not
	// contain the regex delimiter or a single quote.
	$regex = '/
		(
			(?: [\x00-\x7F]                        # ASCII
			|   [\xC2-\xDF][\x80-\xBF]             # 2 bytes, C0 and C1 would be overlong
			|   \xE0[\xA0-\xBF][\x80-\xBF]         # 3 bytes, excluding overlongs
			|   [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # 3 bytes, straight
			|   \xED[\x80-\x9F][\x80-\xBF]         # 3 bytes, excluding surrogates
			|   \xF0[\x90-\xBF][\x80-\xBF]{2}      # 4 bytes, planes 1-3, excluding overlongs
			|   [\xF1-\xF3][\x80-\xBF]{3}          # 4 bytes, planes 4-15
			|   \xF4[\x80-\x8F][\x80-\xBF]{2}      # 4 bytes, plane 16, up to U+10FFFF
			){1,100}                               # bounded repeat keeps matching cheap
		)
	| .                                            # any other byte: group 1 unset, so dropped
	/x';
	$cleaned = preg_replace($regex, '$1', $string);
	// preg_replace() yields null on a PCRE error; never hand that to callers
	// expecting a string.
	return $cleaned ?? '';
}

/**
 * Remove non-printable < 0x20 chars from an ANSI (Windows-1252) string,
 * then convert it to UTF-8.
 *
 * Besides the control characters below 0x20 (including line feed), the
 * five bytes that have no assignment in Windows-1252 (0x81, 0x8D, 0x8F,
 * 0x90, 0x9D) are removed too, since iconv would otherwise reject the
 * whole string.
 *
 * @param string $string byte string in Windows-1252 encoding
 * @return string UTF-8 representation of the printable part of the input;
 *         empty string if filtering or conversion fails
 */
public static function ansiToUtf8(string $string) : string
{
	// The s modifier is required so that "." also consumes line feeds;
	// without it, LF is the one control character that slips through.
	$regex = '/
		(
			(?: [\x20-\x80\x82-\x8C\x8E\x91-\x9C\x9E-\xFF] ){1,100} # bytes to keep
		)
	| .                                            # anything else is dropped
	/xs';
	$printable = preg_replace($regex, '$1', $string);
	if ($printable === null) {
		return '';
	}
	// CP1252 is the canonical name of the charset also known as MS-ANSI and
	// is understood by more iconv implementations than the alias.
	$converted = iconv('CP1252', 'UTF-8', $printable);
	return $converted === false ? '' : $converted;
}