summary | refs | log | tree | commit | diff | stats
path: root/inc/util.inc.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/util.inc.php')
-rw-r--r-- inc/util.inc.php 33
1 files changed, 33 insertions, 0 deletions
diff --git a/inc/util.inc.php b/inc/util.inc.php
index 6be06bf6..83b2d54a 100644
--- a/inc/util.inc.php
+++ b/inc/util.inc.php
@@ -559,4 +559,37 @@ SADFACE;
}
}
+ /**
+  * Remove every byte that is not part of a well-formed UTF-8 sequence.
+  *
+  * "Well-formed" follows RFC 3629: overlong encodings (lead bytes C0 and C1,
+  * E0 followed by 80-9F, F0 followed by 80-8F), UTF-16 surrogates
+  * (ED followed by A0-BF), code points above U+10FFFF (F4 followed by 90-BF,
+  * lead bytes F5-FF), stray continuation bytes and truncated sequences are
+  * all dropped. Valid text, including control characters and line breaks,
+  * is returned unchanged.
+  *
+  * @param string $string arbitrary byte string
+  * @return string the input with all offending bytes removed
+  * @throws \RuntimeException if the PCRE engine reports an error
+  */
+ public static function cleanUtf8(string $string) : string
+ {
+     // Derived from https://stackoverflow.com/a/1401716/2043481, with the
+     // byte ranges tightened to the well-formed sequences of RFC 3629.
+     // Group 1 captures a run of valid sequences and is put back by the
+     // replacement; any other byte falls through to "." and is replaced by
+     // an empty (unset) group 1, i.e. deleted. The {1,100} bound per run is
+     // inherited from the original pattern (presumably to limit the work
+     // PCRE does per match - TODO confirm).
+     $regex = '/
+         (
+             (?: [\x00-\x7F]                        # ASCII, 0xxxxxxx
+               | [\xC2-\xDF][\x80-\xBF]             # 2 bytes, U+0080 to U+07FF
+               | \xE0[\xA0-\xBF][\x80-\xBF]         # 3 bytes, no overlong forms
+               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # 3 bytes, general case
+               | \xED[\x80-\x9F][\x80-\xBF]         # 3 bytes, no surrogates
+               | \xF0[\x90-\xBF][\x80-\xBF]{2}      # 4 bytes, no overlong forms
+               | [\xF1-\xF3][\x80-\xBF]{3}          # 4 bytes, general case
+               | \xF4[\x80-\x8F][\x80-\xBF]{2}      # 4 bytes, up to U+10FFFF
+             ){1,100}                               # ...one or more times
+         )
+         | .                                        # anything else
+     /x';
+     $cleaned = preg_replace($regex, '$1', $string);
+     if ($cleaned === null) {
+         // preg_replace() signals engine failures (e.g. backtrack limit) with
+         // null, which would otherwise surface as an unrelated TypeError.
+         throw new \RuntimeException('cleanUtf8: PCRE error code ' . preg_last_error());
+     }
+     return $cleaned;
+ }
+
+ /**
+  * Strip control bytes below 0x20 from an ANSI string, then convert it to UTF-8.
+  *
+  * The input is handed to iconv as 'MS-ANSI'. Bytes 0x00-0x1F are removed
+  * before conversion, with one exception: the line feed (0x0A) is kept.
+  * NOTE(review): the previous pattern relied on "." without the "s" modifier,
+  * which never matches a line feed, so LF has always survived while CR and
+  * TAB were dropped. That behaviour is preserved here; confirm whether
+  * callers actually want line feeds kept.
+  *
+  * @param string $string byte string in the MS-ANSI charset
+  * @return string UTF-8 encoded string without the stripped control bytes
+  * @throws \RuntimeException if the PCRE engine fails or iconv cannot convert the input
+  */
+ public static function ansiToUtf8(string $string) : string
+ {
+     // Equivalent to the former "(run of 0x20-0xFF) | ." pattern, but spelled
+     // out as the set of bytes that actually get deleted.
+     $printable = preg_replace('/[\x00-\x09\x0B-\x1F]+/', '', $string);
+     if ($printable === null) {
+         throw new \RuntimeException('ansiToUtf8: PCRE error code ' . preg_last_error());
+     }
+     // iconv() returns false (e.g. unknown charset name or a byte with no
+     // mapping); fail loudly instead of letting false leak into the string
+     // return type.
+     $utf8 = iconv('MS-ANSI', 'UTF-8', $printable);
+     if ($utf8 === false) {
+         throw new \RuntimeException('ansiToUtf8: iconv could not convert input from MS-ANSI to UTF-8');
+     }
+     return $utf8;
+ }
+
}