From 4e31f99e098568c5d133125630533dad91d07348 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 30 Jul 2020 17:21:29 +0200 Subject: [serversetup-bwlp-ipxe/statistics] Sanitize strings from clients Some string from the clients might not be well suited for utf-8 representation. Add wrapper functions that clean utf-8 strings, or convert ANSI strings to UTF-8 while removing problematic chars. --- inc/util.inc.php | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'inc/util.inc.php') diff --git a/inc/util.inc.php b/inc/util.inc.php index 6be06bf6..83b2d54a 100644 --- a/inc/util.inc.php +++ b/inc/util.inc.php @@ -559,4 +559,37 @@ SADFACE; } } + /** + * Remove any non-utf8 sequences from string. + */ + public static function cleanUtf8(string $string) : string + { + // https://stackoverflow.com/a/1401716/2043481 + $regex = '/ + ( + (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3 + ){1,100} # ...one or more times + ) + | . # anything else + /x'; + return preg_replace($regex, '$1', $string); + } + + /** + * Remove non-printable < 0x20 chars from ANSI string, then convert to UTF-8 + */ + public static function ansiToUtf8(string $string) : string + { + $regex = '/ + ( + (?: [\x20-\xFF] ){1,100} # ignore lower non-printable range + ) + | . # anything else + /x'; + return iconv('MS-ANSI', 'UTF-8', preg_replace($regex, '$1', $string)); + } + } -- cgit v1.2.3-55-g7522