From 2287af21912af901858e2dc6aa3a073b0e8e6b03 Mon Sep 17 00:00:00 2001 From: Norayr Chilingarian Date: Tue, 27 Jan 2015 10:54:03 +0400 Subject: [PATCH] eth unicode conversion module. -- noch Former-commit-id: c2794dc7fbf9ce484f405f8b71f025864489e3f4 --- makefile | 1 + makefile.darwin.clang.x86_64 | 2 +- makefile.freebsd.clang.x86_64 | 2 +- makefile.linux.clang.powerpc | 2 +- makefile.linux.clang.x86_64 | 2 +- makefile.linux.gcc.armv6j | 2 +- makefile.linux.gcc.armv6j_hardfp | 2 +- makefile.linux.gcc.armv7a_hardfp | 2 +- makefile.linux.gcc.powerpc | 2 +- makefile.linux.gcc.x86 | 2 +- makefile.linux.gcc.x86_64 | 2 +- src/lib/s3/ethUnicode.Mod | 216 +++++++++++++++++++++++++++++++ 12 files changed, 227 insertions(+), 10 deletions(-) create mode 100644 src/lib/s3/ethUnicode.Mod diff --git a/makefile b/makefile index 3585dfa6..b12dd440 100644 --- a/makefile +++ b/makefile @@ -247,6 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools diff --git a/makefile.darwin.clang.x86_64 b/makefile.darwin.clang.x86_64 index 9723fb10..ae42f938 100644 --- a/makefile.darwin.clang.x86_64 +++ b/makefile.darwin.clang.x86_64 @@ -249,7 +249,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.freebsd.clang.x86_64 b/makefile.freebsd.clang.x86_64 index 6b2db347..a6ecf136 100644 --- a/makefile.freebsd.clang.x86_64 +++ b/makefile.freebsd.clang.x86_64 @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.clang.powerpc b/makefile.linux.clang.powerpc index 396377f3..72456fe6 100644 --- a/makefile.linux.clang.powerpc +++ b/makefile.linux.clang.powerpc @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.clang.x86_64 b/makefile.linux.clang.x86_64 index 8b70052b..93c292e6 100644 --- a/makefile.linux.clang.x86_64 +++ b/makefile.linux.clang.x86_64 @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.armv6j b/makefile.linux.gcc.armv6j index d3461e88..05d16a1c 100644 --- a/makefile.linux.gcc.armv6j +++ b/makefile.linux.gcc.armv6j @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.armv6j_hardfp b/makefile.linux.gcc.armv6j_hardfp index df8d9bbe..ecd54f8a 100644 --- a/makefile.linux.gcc.armv6j_hardfp +++ b/makefile.linux.gcc.armv6j_hardfp @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.armv7a_hardfp b/makefile.linux.gcc.armv7a_hardfp index 696792cb..c3f872c3 100644 --- a/makefile.linux.gcc.armv7a_hardfp +++ b/makefile.linux.gcc.armv7a_hardfp @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.powerpc b/makefile.linux.gcc.powerpc index 154e5e98..36959e0c 100644 --- a/makefile.linux.gcc.powerpc +++ b/makefile.linux.gcc.powerpc @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.x86 b/makefile.linux.gcc.x86 index 2eeba06b..7a03a73c 100644 --- a/makefile.linux.gcc.x86 +++ b/makefile.linux.gcc.x86 @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/makefile.linux.gcc.x86_64 b/makefile.linux.gcc.x86_64 index 3585dfa6..66282a59 100644 --- a/makefile.linux.gcc.x86_64 +++ b/makefile.linux.gcc.x86_64 @@ -247,7 +247,7 @@ stage6: $(VOCSTATIC) -sP ethRandomNumbers.Mod $(VOCSTATIC) -sP ethGZReaders.Mod $(VOCSTATIC) -sP ethGZWriters.Mod - + $(VOCSTATIC) -sP ethUnicode.Mod # build remaining tools # $(VOCSTATIC0) -sPS compatIn.Mod diff --git a/src/lib/s3/ethUnicode.Mod b/src/lib/s3/ethUnicode.Mod new file mode 100644 index 00000000..7d90f652 --- /dev/null +++ b/src/lib/s3/ethUnicode.Mod @@ -0,0 +1,216 @@ +(* ETH Oberon, Copyright 2000 ETH Zuerich Institut fuer Computersysteme, ETH Zentrum, CH-8092 Zuerich. +Refer to the "General ETH Oberon System Source License" contract available at: http://www.oberon.ethz.ch/ *) + +MODULE ethUnicode; (* be *) + +IMPORT SYSTEM; + +PROCEDURE AND(a,b: LONGINT): LONGINT; +BEGIN RETURN SYSTEM.VAL(LONGINT, SYSTEM.VAL(SET, a) * SYSTEM.VAL(SET, b)) +END AND; + +(** UCStoUTF8 - converts a single unicode-character to one UTF-8 character. The UTF-8 character is written + into 'utf8' starting at position 'pos' that points immediatly behind the inserted character. + Returns TRUE if the conversion was successful *) +PROCEDURE UCStoUTF8*(ucs: LONGINT; VAR utf8: ARRAY OF CHAR; VAR pos: LONGINT): BOOLEAN; +VAR len: LONGINT; + byte, mask, max, i: INTEGER; + buf: ARRAY 6 OF CHAR; +BEGIN + len := LEN(utf8); + + IF (ucs <= 7FH) THEN + IF (pos + 1 < len) THEN utf8[pos] := CHR(SHORT(ucs)); + utf8[pos+1] := 0X; + pos := pos + 1 + ELSE RETURN FALSE + END + ELSE + byte := 0; mask := 7F80H; max := 3FH; + + WHILE (ucs > max) DO + buf[byte] := CHR(80H + SHORT(AND(ucs, 3FH))); INC(byte); + ucs := ucs DIV 64; (* SYSTEM.LSH(ucs, -6) *) + mask := mask DIV 2; (* 80H + SYSTEM.LSH(mask, -1) *) + max := max DIV 2; (* SYSTEM.LSH(max, -1) *) + END; + buf[byte] := CHR(mask + SHORT(ucs)); + + IF (pos + byte + 1 < len) THEN + FOR i := 0 TO byte DO utf8[pos + i] := buf[byte - i] END; + utf8[pos+byte+1] := 0X; + pos := pos + byte + 1 + ELSE RETURN FALSE + END + END; + RETURN TRUE +END UCStoUTF8; + +(** UCS2toUTF8 - converts an array of 16-bit unicode characters to a UTF-8 string *) +PROCEDURE UCS2toUTF8*(VAR ucs2: ARRAY OF INTEGER; VAR utf8: ARRAY OF CHAR); +VAR i, p: LONGINT; + b: BOOLEAN; +BEGIN + b := TRUE; i := 0; p := 0; + WHILE (i < LEN(ucs2)) & b DO + b := UCStoUTF8(ucs2[i], utf8, p); + INC(i) + END +END UCS2toUTF8; + +(** UCS4toUTF8 - converts an array of 32-bit unicode characters to an UTF-8 string *) +PROCEDURE UCS4toUTF8*(VAR ucs4: ARRAY OF LONGINT; VAR utf8: ARRAY OF CHAR); +VAR i, p: LONGINT; + b: BOOLEAN; +BEGIN + b := TRUE; i := 0; p := 0; + WHILE (i < LEN(ucs4)) & b DO + b := UCStoUTF8(ucs4[i], utf8, p); + INC(i) + END +END UCS4toUTF8; + +(** UTF8toUCS - converts the UTF-8 character in the string 'utf8' at position 'p' into an unicode character. + Returns TRUE if the conversion was successful *) +PROCEDURE UTF8toUCS*(VAR utf8: ARRAY OF CHAR; VAR p: LONGINT; VAR ucs: LONGINT): BOOLEAN; +VAR b: LONGINT; + bytes, mask, i: INTEGER; + s: SET; + res: BOOLEAN; +BEGIN + res := FALSE; + IF (p < LEN(utf8)) THEN + b := ORD(utf8[p]); + IF (b < 80H) THEN ucs := b; INC(p); res := TRUE + ELSE + bytes := 2; mask := 3FH; s := SYSTEM.VAL(SET, b); + WHILE ((7-bytes) IN s) DO INC(bytes); mask := mask DIV 2 END; + ucs := AND(b, mask); + IF (p + bytes - 1 < LEN(utf8))THEN + FOR i := 1 TO bytes-1 DO ucs := ucs * 64 + AND(ORD(utf8[p+i]), 3FH) END; + p := p + bytes; + res := TRUE + END + END + END; + RETURN res +END UTF8toUCS; + +(** UTF8toUCS2 - converts an UTF-8 string into an array of 16-bit unicode characters. The first character is placed + at position 'idx'. Returns TRUE if the conversion was successful *) +PROCEDURE UTF8toUCS2*(VAR utf8: ARRAY OF CHAR; VAR ucs2: ARRAY OF INTEGER; VAR idx: LONGINT): BOOLEAN; +VAR p, ucs: LONGINT; +BEGIN + p := 0; + WHILE UTF8toUCS(utf8, p, ucs) & (ucs > 0) & (idx < LEN(ucs2)-1) DO + IF (ucs <= MAX(INTEGER)) THEN ucs2[idx] := SHORT(ucs) + ELSE ucs2[0] := 0; RETURN FALSE + END; + INC(idx) + END; + IF (idx < LEN(ucs2)) THEN ucs2[idx] := 0; INC(idx) END; + RETURN TRUE +END UTF8toUCS2; + +(** UTF8toUCS4 - converts an UTF-8 string into an array of 32-bit unicode characters. The first character is placed + at position 'idx'. Returns TRUE if the conversion was successful *) +PROCEDURE UTF8toUCS4*(VAR utf8: ARRAY OF CHAR; VAR ucs4: ARRAY OF LONGINT; VAR idx: LONGINT); +VAR p: LONGINT; +BEGIN + p := 0; + WHILE (idx < LEN(ucs4)) & UTF8toUCS(utf8, p, ucs4[idx]) & (ucs4[idx] > 0) DO + INC(idx) + END; + IF (idx < LEN(ucs4)) THEN ucs4[idx] := 0; INC(idx) END +END UTF8toUCS4; + +(** UTF8toASCII - converts an UTF8-string into an ASCII-string. 'lossy' is TRUE if some information was lost during the + conversion. Returns TRUE if the conversion was successful *) +PROCEDURE UTF8toASCII*(utf8: ARRAY OF CHAR; VAR ascii: ARRAY OF CHAR; VAR lossy: BOOLEAN): BOOLEAN; +VAR p, idx, ucs: LONGINT; +BEGIN + p := 0; idx := 0; ucs := -1; + WHILE (ucs # 0) & UTF8toUCS(utf8, p, ucs) & (idx < LEN(ascii)) DO + IF (ucs >= 0) & (ucs < 256) THEN ascii[idx] := CHR(ucs) + ELSE ascii[idx] := "_" + END; + INC(idx) + END; + IF (ascii[idx-1] # 0X) & (idx < LEN(ascii)) THEN ascii[idx] := 0X; INC(idx) END; + RETURN ascii[idx-1] = 0X +END UTF8toASCII; + +(** ASCIItoUTF8 - converts an ASCII-string into an UTF8-string *) +PROCEDURE ASCIItoUTF8*(ascii: ARRAY OF CHAR; VAR utf8: ARRAY OF CHAR); +VAR l, i: LONGINT; + ucs: POINTER TO ARRAY OF INTEGER; +BEGIN + l := 0; WHILE (ascii[l] # 0X) DO INC(l) END; + NEW(ucs, l); + FOR i := 0 TO l-1 DO ucs[i] := ORD(ascii[i]) END; + UCS2toUTF8(ucs^, utf8) +END ASCIItoUTF8; + +(** UpperCh - returns the upper case of a character. 'lossy' is TRUE if some information was lost during the conversion. *) +PROCEDURE UpperCh*(ch: CHAR; VAR lossy: BOOLEAN): CHAR; +BEGIN + lossy := TRUE; + CASE ch OF + "a" .. "z": ch := CAP(ch); lossy := FALSE | + "0".."9", "A".."Z", "€", "", "‚", "$", ".", "%", "'", "-", "_", "@", "~", "`", "!", "(", ")", "{", "}", "^", "#", "&": lossy := FALSE | + "ƒ": ch := "€"; lossy := FALSE | + "„": ch := ""; lossy := FALSE | + "…": ch := "‚"; lossy := FALSE | + "†": ch := "A" | + "‡": ch := "E" | + "ˆ": ch := "I" | + "‰": ch := "O" | + "Š": ch := "U" | + "‹": ch := "A" | + "Œ": ch := "E" | + "": ch := "I" | + "Ž": ch := "O" | + "": ch := "U" | + "": ch := "E" | + "‘": ch := "E" | + "’": ch := "I" | + "“": ch := "C" | + "”": ch := "A" | + "•": ch := "N" | + "–": ch := "S" + ELSE + END; + RETURN ch +END UpperCh; + +(** Length - returns the length of a string *) +PROCEDURE Length*(VAR s: ARRAY OF CHAR): LONGINT; +VAR p, l: LONGINT; +BEGIN + l := LEN(s); p := 0; + WHILE (p < l) & (s[p] # 0X) DO INC(p) END; + RETURN p +END Length; + +(** Append - appends 'this' to 'to' *) +PROCEDURE Append*(VAR to: ARRAY OF CHAR; this: ARRAY OF CHAR); +VAR i, j, l: LONGINT; +BEGIN + i := 0; WHILE to[i] # 0X DO INC(i) END; + l := LEN(to)-1; j := 0; + WHILE (i < l) & (this[j] # 0X) DO to[i] := this[j]; INC(i); INC(j) END; + to[i] := 0X +END Append; + +(** Prepend - appends 'to' to 'this' *) +PROCEDURE Prepend*(VAR to: ARRAY OF CHAR; this: ARRAY OF CHAR); +VAR tmp: POINTER TO ARRAY OF CHAR; +BEGIN + NEW(tmp, LEN(to)); + COPY(this, tmp^); + Append(tmp^, to); + COPY(tmp^, to) +END Prepend; + +END ethUnicode. +ÛBIER¸Éx:ZÿÿÿÿCOberon10.Scn.Fnt29.03.01 17:55:31TimeStamps.New€