diff options
author | Simon Rettberg | 2020-07-30 17:21:29 +0200 |
---|---|---|
committer | Simon Rettberg | 2020-07-30 17:21:29 +0200 |
commit | 71fe655d73db79fa03e17e8e414568c8ea0f5842 (patch) | |
tree | 13398be8bbdc9942461419c32eda5fda1b397c95 /inc | |
parent | [exams] More sanity checks regarding date/time and locations (diff) | |
download | slx-admin-71fe655d73db79fa03e17e8e414568c8ea0f5842.tar.gz slx-admin-71fe655d73db79fa03e17e8e414568c8ea0f5842.tar.xz slx-admin-71fe655d73db79fa03e17e8e414568c8ea0f5842.zip |
[serversetup-bwlp-ipxe/statistics] Sanitize strings from clients
Some strings from the clients might not be well suited for UTF-8
representation. Add wrapper functions that clean utf-8 strings, or
convert ANSI strings to UTF-8 while removing problematic chars.
Diffstat (limited to 'inc')
-rw-r--r-- | inc/util.inc.php | 33 |
1 files changed, 33 insertions, 0 deletions
/**
 * Remove every byte that is not part of a well-formed UTF-8 sequence.
 *
 * Well-formed means the sequences allowed by RFC 3629: overlong encodings,
 * UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF are
 * dropped just like stray lead or continuation bytes.
 *
 * @param string $string arbitrary bytes, e.g. a string reported by a client
 * @return string the input with all offending bytes removed; if PCRE itself
 *         fails, the input reduced to its 7-bit ASCII bytes
 */
public static function cleanUtf8(string $string) : string
{
	// Based on https://stackoverflow.com/a/1401716/2043481, with the lead and
	// continuation byte ranges tightened to the well-formed table of RFC 3629.
	// No "u" modifier on purpose: the subject must be matched byte by byte.
	$regex = '/
	(
		(?: [\x00-\x7F]                        # single byte, ASCII
		|   [\xC2-\xDF][\x80-\xBF]             # two bytes, overlong C0 and C1 leads excluded
		|   \xE0[\xA0-\xBF][\x80-\xBF]         # three bytes, overlong forms excluded
		|   [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # three bytes, general case
		|   \xED[\x80-\x9F][\x80-\xBF]         # three bytes, surrogates excluded
		|   \xF0[\x90-\xBF][\x80-\xBF]{2}      # four bytes, overlong forms excluded
		|   [\xF1-\xF3][\x80-\xBF]{3}          # four bytes, general case
		|   \xF4[\x80-\x8F][\x80-\xBF]{2}      # four bytes, capped at U+10FFFF
		){1,100}                               # ...in chunks of up to 100 sequences
	)
	| .                                        # any other byte: not captured, so it is dropped
	/xs';
	$clean = preg_replace($regex, '$1', $string);
	if ($clean === null) {
		// preg_replace() yields null on a PCRE error, which would violate the
		// declared return type. Degrade to plain ASCII instead.
		$clean = preg_replace('/[^\x00-\x7F]+/', '', $string) ?? '';
	}
	return $clean;
}

/**
 * Remove control characters from an ANSI (Windows-1252) string, then convert
 * it to UTF-8.
 *
 * Removed before conversion:
 * - all bytes below 0x20, including CR and LF
 * - 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which have no character assigned in
 *   Windows-1252 and can make iconv() reject the whole string
 *
 * @param string $string single-byte encoded input, e.g. a string reported by a client
 * @return string UTF-8 representation; if iconv() fails nevertheless, only
 *         the 7-bit ASCII part of the stripped input is returned
 */
public static function ansiToUtf8(string $string) : string
{
	// An explicit character class is used instead of a catch-all ".", since
	// "." does not match a line feed unless the "s" modifier is set.
	$stripped = preg_replace('/[\x00-\x1F\x81\x8D\x8F\x90\x9D]+/', '', $string);
	if ($stripped === null) {
		// PCRE error; nothing trustworthy left to convert
		return '';
	}
	$converted = iconv('MS-ANSI', 'UTF-8', $stripped);
	if ($converted === false) {
		// Conversion failed (e.g. charset alias unknown to this iconv
		// implementation): ASCII bytes are identical in both encodings.
		return preg_replace('/[^\x00-\x7F]+/', '', $stripped) ?? '';
	}
	return $converted;
}