mirror of
https://github.com/vishapoberon/compiler.git
synced 2026-04-05 23:22:25 +00:00
eth unicode conversion module. -- noch
This commit is contained in:
parent
8b846c9a5a
commit
c2794dc7fb
12 changed files with 227 additions and 10 deletions
1
makefile
1
makefile
|
|
@ -247,6 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
|
||||
# build remaining tools
|
||||
|
|
|
|||
|
|
@ -249,7 +249,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ stage6:
|
|||
$(VOCSTATIC) -sP ethRandomNumbers.Mod
|
||||
$(VOCSTATIC) -sP ethGZReaders.Mod
|
||||
$(VOCSTATIC) -sP ethGZWriters.Mod
|
||||
|
||||
$(VOCSTATIC) -sP ethUnicode.Mod
|
||||
|
||||
# build remaining tools
|
||||
# $(VOCSTATIC0) -sPS compatIn.Mod
|
||||
|
|
|
|||
216
src/lib/s3/ethUnicode.Mod
Normal file
216
src/lib/s3/ethUnicode.Mod
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
(* ETH Oberon, Copyright 2000 ETH Zuerich Institut fuer Computersysteme, ETH Zentrum, CH-8092 Zuerich.
|
||||
Refer to the "General ETH Oberon System Source License" contract available at: http://www.oberon.ethz.ch/ *)
|
||||
|
||||
MODULE ethUnicode; (* be *)
|
||||
|
||||
IMPORT SYSTEM;
|
||||
|
||||
PROCEDURE AND(a,b: LONGINT): LONGINT;
|
||||
BEGIN RETURN SYSTEM.VAL(LONGINT, SYSTEM.VAL(SET, a) * SYSTEM.VAL(SET, b))
|
||||
END AND;
|
||||
|
||||
(** UCStoUTF8 - converts a single unicode-character to one UTF-8 character. The UTF-8 character is written
|
||||
into 'utf8' starting at position 'pos' that points immediatly behind the inserted character.
|
||||
Returns TRUE if the conversion was successful *)
|
||||
PROCEDURE UCStoUTF8*(ucs: LONGINT; VAR utf8: ARRAY OF CHAR; VAR pos: LONGINT): BOOLEAN;
|
||||
VAR len: LONGINT;
|
||||
byte, mask, max, i: INTEGER;
|
||||
buf: ARRAY 6 OF CHAR;
|
||||
BEGIN
|
||||
len := LEN(utf8);
|
||||
|
||||
IF (ucs <= 7FH) THEN
|
||||
IF (pos + 1 < len) THEN utf8[pos] := CHR(SHORT(ucs));
|
||||
utf8[pos+1] := 0X;
|
||||
pos := pos + 1
|
||||
ELSE RETURN FALSE
|
||||
END
|
||||
ELSE
|
||||
byte := 0; mask := 7F80H; max := 3FH;
|
||||
|
||||
WHILE (ucs > max) DO
|
||||
buf[byte] := CHR(80H + SHORT(AND(ucs, 3FH))); INC(byte);
|
||||
ucs := ucs DIV 64; (* SYSTEM.LSH(ucs, -6) *)
|
||||
mask := mask DIV 2; (* 80H + SYSTEM.LSH(mask, -1) *)
|
||||
max := max DIV 2; (* SYSTEM.LSH(max, -1) *)
|
||||
END;
|
||||
buf[byte] := CHR(mask + SHORT(ucs));
|
||||
|
||||
IF (pos + byte + 1 < len) THEN
|
||||
FOR i := 0 TO byte DO utf8[pos + i] := buf[byte - i] END;
|
||||
utf8[pos+byte+1] := 0X;
|
||||
pos := pos + byte + 1
|
||||
ELSE RETURN FALSE
|
||||
END
|
||||
END;
|
||||
RETURN TRUE
|
||||
END UCStoUTF8;
|
||||
|
||||
(** UCS2toUTF8 - converts an array of 16-bit unicode characters to a UTF-8 string *)
|
||||
PROCEDURE UCS2toUTF8*(VAR ucs2: ARRAY OF INTEGER; VAR utf8: ARRAY OF CHAR);
|
||||
VAR i, p: LONGINT;
|
||||
b: BOOLEAN;
|
||||
BEGIN
|
||||
b := TRUE; i := 0; p := 0;
|
||||
WHILE (i < LEN(ucs2)) & b DO
|
||||
b := UCStoUTF8(ucs2[i], utf8, p);
|
||||
INC(i)
|
||||
END
|
||||
END UCS2toUTF8;
|
||||
|
||||
(** UCS4toUTF8 - converts an array of 32-bit unicode characters to an UTF-8 string *)
|
||||
PROCEDURE UCS4toUTF8*(VAR ucs4: ARRAY OF LONGINT; VAR utf8: ARRAY OF CHAR);
|
||||
VAR i, p: LONGINT;
|
||||
b: BOOLEAN;
|
||||
BEGIN
|
||||
b := TRUE; i := 0; p := 0;
|
||||
WHILE (i < LEN(ucs4)) & b DO
|
||||
b := UCStoUTF8(ucs4[i], utf8, p);
|
||||
INC(i)
|
||||
END
|
||||
END UCS4toUTF8;
|
||||
|
||||
(** UTF8toUCS - converts the UTF-8 character in the string 'utf8' at position 'p' into an unicode character.
|
||||
Returns TRUE if the conversion was successful *)
|
||||
PROCEDURE UTF8toUCS*(VAR utf8: ARRAY OF CHAR; VAR p: LONGINT; VAR ucs: LONGINT): BOOLEAN;
|
||||
VAR b: LONGINT;
|
||||
bytes, mask, i: INTEGER;
|
||||
s: SET;
|
||||
res: BOOLEAN;
|
||||
BEGIN
|
||||
res := FALSE;
|
||||
IF (p < LEN(utf8)) THEN
|
||||
b := ORD(utf8[p]);
|
||||
IF (b < 80H) THEN ucs := b; INC(p); res := TRUE
|
||||
ELSE
|
||||
bytes := 2; mask := 3FH; s := SYSTEM.VAL(SET, b);
|
||||
WHILE ((7-bytes) IN s) DO INC(bytes); mask := mask DIV 2 END;
|
||||
ucs := AND(b, mask);
|
||||
IF (p + bytes - 1 < LEN(utf8))THEN
|
||||
FOR i := 1 TO bytes-1 DO ucs := ucs * 64 + AND(ORD(utf8[p+i]), 3FH) END;
|
||||
p := p + bytes;
|
||||
res := TRUE
|
||||
END
|
||||
END
|
||||
END;
|
||||
RETURN res
|
||||
END UTF8toUCS;
|
||||
|
||||
(** UTF8toUCS2 - converts an UTF-8 string into an array of 16-bit unicode characters. The first character is placed
|
||||
at position 'idx'. Returns TRUE if the conversion was successful *)
|
||||
PROCEDURE UTF8toUCS2*(VAR utf8: ARRAY OF CHAR; VAR ucs2: ARRAY OF INTEGER; VAR idx: LONGINT): BOOLEAN;
|
||||
VAR p, ucs: LONGINT;
|
||||
BEGIN
|
||||
p := 0;
|
||||
WHILE UTF8toUCS(utf8, p, ucs) & (ucs > 0) & (idx < LEN(ucs2)-1) DO
|
||||
IF (ucs <= MAX(INTEGER)) THEN ucs2[idx] := SHORT(ucs)
|
||||
ELSE ucs2[0] := 0; RETURN FALSE
|
||||
END;
|
||||
INC(idx)
|
||||
END;
|
||||
IF (idx < LEN(ucs2)) THEN ucs2[idx] := 0; INC(idx) END;
|
||||
RETURN TRUE
|
||||
END UTF8toUCS2;
|
||||
|
||||
(** UTF8toUCS4 - converts an UTF-8 string into an array of 32-bit unicode characters. The first character is placed
|
||||
at position 'idx'. Returns TRUE if the conversion was successful *)
|
||||
PROCEDURE UTF8toUCS4*(VAR utf8: ARRAY OF CHAR; VAR ucs4: ARRAY OF LONGINT; VAR idx: LONGINT);
|
||||
VAR p: LONGINT;
|
||||
BEGIN
|
||||
p := 0;
|
||||
WHILE (idx < LEN(ucs4)) & UTF8toUCS(utf8, p, ucs4[idx]) & (ucs4[idx] > 0) DO
|
||||
INC(idx)
|
||||
END;
|
||||
IF (idx < LEN(ucs4)) THEN ucs4[idx] := 0; INC(idx) END
|
||||
END UTF8toUCS4;
|
||||
|
||||
(** UTF8toASCII - converts an UTF8-string into an ASCII-string. 'lossy' is TRUE if some information was lost during the
|
||||
conversion. Returns TRUE if the conversion was successful *)
|
||||
PROCEDURE UTF8toASCII*(utf8: ARRAY OF CHAR; VAR ascii: ARRAY OF CHAR; VAR lossy: BOOLEAN): BOOLEAN;
|
||||
VAR p, idx, ucs: LONGINT;
|
||||
BEGIN
|
||||
p := 0; idx := 0; ucs := -1;
|
||||
WHILE (ucs # 0) & UTF8toUCS(utf8, p, ucs) & (idx < LEN(ascii)) DO
|
||||
IF (ucs >= 0) & (ucs < 256) THEN ascii[idx] := CHR(ucs)
|
||||
ELSE ascii[idx] := "_"
|
||||
END;
|
||||
INC(idx)
|
||||
END;
|
||||
IF (ascii[idx-1] # 0X) & (idx < LEN(ascii)) THEN ascii[idx] := 0X; INC(idx) END;
|
||||
RETURN ascii[idx-1] = 0X
|
||||
END UTF8toASCII;
|
||||
|
||||
(** ASCIItoUTF8 - converts an ASCII-string into an UTF8-string *)
|
||||
PROCEDURE ASCIItoUTF8*(ascii: ARRAY OF CHAR; VAR utf8: ARRAY OF CHAR);
|
||||
VAR l, i: LONGINT;
|
||||
ucs: POINTER TO ARRAY OF INTEGER;
|
||||
BEGIN
|
||||
l := 0; WHILE (ascii[l] # 0X) DO INC(l) END;
|
||||
NEW(ucs, l);
|
||||
FOR i := 0 TO l-1 DO ucs[i] := ORD(ascii[i]) END;
|
||||
UCS2toUTF8(ucs^, utf8)
|
||||
END ASCIItoUTF8;
|
||||
|
||||
(** UpperCh - returns the upper case of a character. 'lossy' is TRUE if some information was lost during the conversion. *)
|
||||
PROCEDURE UpperCh*(ch: CHAR; VAR lossy: BOOLEAN): CHAR;
|
||||
BEGIN
|
||||
lossy := TRUE;
|
||||
CASE ch OF
|
||||
"a" .. "z": ch := CAP(ch); lossy := FALSE |
|
||||
"0".."9", "A".."Z", "€", "<22>", "‚", "$", ".", "%", "'", "-", "_", "@", "~", "`", "!", "(", ")", "{", "}", "^", "#", "&": lossy := FALSE |
|
||||
"ƒ": ch := "€"; lossy := FALSE |
|
||||
"„": ch := "<22>"; lossy := FALSE |
|
||||
"…": ch := "‚"; lossy := FALSE |
|
||||
"†": ch := "A" |
|
||||
"‡": ch := "E" |
|
||||
"ˆ": ch := "I" |
|
||||
"‰": ch := "O" |
|
||||
"Š": ch := "U" |
|
||||
"‹": ch := "A" |
|
||||
"Œ": ch := "E" |
|
||||
"<22>": ch := "I" |
|
||||
"Ž": ch := "O" |
|
||||
"<22>": ch := "U" |
|
||||
"<22>": ch := "E" |
|
||||
"‘": ch := "E" |
|
||||
"’": ch := "I" |
|
||||
"“": ch := "C" |
|
||||
"”": ch := "A" |
|
||||
"•": ch := "N" |
|
||||
"–": ch := "S"
|
||||
ELSE
|
||||
END;
|
||||
RETURN ch
|
||||
END UpperCh;
|
||||
|
||||
(** Length - returns the length of a string *)
|
||||
PROCEDURE Length*(VAR s: ARRAY OF CHAR): LONGINT;
|
||||
VAR p, l: LONGINT;
|
||||
BEGIN
|
||||
l := LEN(s); p := 0;
|
||||
WHILE (p < l) & (s[p] # 0X) DO INC(p) END;
|
||||
RETURN p
|
||||
END Length;
|
||||
|
||||
(** Append - appends 'this' to 'to' *)
|
||||
PROCEDURE Append*(VAR to: ARRAY OF CHAR; this: ARRAY OF CHAR);
|
||||
VAR i, j, l: LONGINT;
|
||||
BEGIN
|
||||
i := 0; WHILE to[i] # 0X DO INC(i) END;
|
||||
l := LEN(to)-1; j := 0;
|
||||
WHILE (i < l) & (this[j] # 0X) DO to[i] := this[j]; INC(i); INC(j) END;
|
||||
to[i] := 0X
|
||||
END Append;
|
||||
|
||||
(** Prepend - appends 'to' to 'this' *)
|
||||
PROCEDURE Prepend*(VAR to: ARRAY OF CHAR; this: ARRAY OF CHAR);
|
||||
VAR tmp: POINTER TO ARRAY OF CHAR;
|
||||
BEGIN
|
||||
NEW(tmp, LEN(to));
|
||||
COPY(this, tmp^);
|
||||
Append(tmp^, to);
|
||||
COPY(tmp^, to)
|
||||
END Prepend;
|
||||
|
||||
END ethUnicode.
|
||||
ÛBIER¸Éx:ZÿÿÿÿCOberon10.Scn.Fnt29.03.01 17:55:31TimeStamps.New€
|
||||
Loading…
Add table
Add a link
Reference in a new issue