[cmucl-cvs] [git] CMU Common Lisp branch master updated. snapshot-2012-08-7-g88d77e8

26 Aug 2012

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMU Common Lisp".

The branch, master has been updated
       via  88d77e83c17fd5d48be454542389d3ff71a27b4b (commit)
      from  a176515cc4356f3f6de43391604e90c7e8391b99 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 88d77e83c17fd5d48be454542389d3ff71a27b4b
Author: Raymond Toy <toy.raymond@gmail.com>
Date:   Sat Aug 25 08:48:51 2012 -0700

    Change %primitive print.to output strings in utf8 instead of utf16.
    No more random NUL ASCII characters on output now.

diff --git a/src/lisp/interr.c b/src/lisp/interr.c
index 57e18f2..90c7f7d 100644
--- a/src/lisp/interr.c
+++ b/src/lisp/interr.c
@@ -160,6 +160,152 @@ internal_error(os_context_t * context)
 
 /* Utility routines used by random pieces of code. */
 
+#if defined(UNICODE)
+/*
+ * Convert a unicode code point to a set of utf8-encoded octets to
+ * standard output.  This is the algorithm used by the Lisp utf8
+ * encoder in src/code/extfmts.lisp.
+ */
+static void
+utf8(int code, int len)
+{
+    int k;
+    int j = 6 - len;
+    int p = 6 * len;
+    int init = 0xff & (0x7e << j);
+    int c;
+
+    /*
+     * (ldb (byte j p) code): Extract j bits from position p of the code
+     */
+    c = (code >> p) & ((1 << j) - 1);
+    
+    putchar(init | c);
+
+    for (k = 0; k < len; ++k) {
+        p -= 6;
+        /* (ldb (byte 6 p) code) */
+        c = (code >> p) & ((1 << 6) - 1);
+        putchar(128 | c);
+    }
+}
+
+/*
+ * Test if code is a surrogate.  Returns true if so. If the code is a
+ * surrogate, then type indicates if it is a high (0) or low (1)
+ * surrogate.  If not a surrogate, type is not modified.  If type is
+ * NULL, then no type is returned.
+ */
+boolean
+surrogatep(int code, int *type)
+{
+    boolean result;
+
+    if ((code >> 11) == 0x1b) {
+        result = 1;
+        if (type) {
+            *type = (code >> 10) & 1;
+        }
+    } else {
+        result = 0;
+    }
+
+    return result;
+}
+
+/*
+ * Convert one or two utf16 code units into a code point.  utf16
+ * points to the string, len is the length of the string.  The
+ * codepoint is returned and the number of code units consumed is
+ * returned in consumed.
+ */
+int
+utf16_codepoint(unsigned short int* utf16, int len, int* consumed)
+{
+    int code = *utf16;
+    int read = 1;
+    
+    int code_type;
+
+    /*
+     * If the current code unit is not a surrogate, we're done.
+     * Otherwise process the surrogate
+     */
+    
+    if (surrogatep(code, &code_type)) {
+        /*
+         * Try to get the following surrogate, if there are still code
+         * units left.  If not, we have a bare surrogate, so just
+         * return the replacement character.
+         */
+        if (len > 0) {
+            int next = utf16[1];
+            int next_type;
+            if (surrogatep(next, &next_type)) {
+                /* Got the following surrogate, so combine them if possible */
+                if ((code_type == 0) && (next_type == 1)) {
+                    /* High followed by low surrogate */
+                    code = ((code - 0xd800) << 10) + next + 0x2400;
+                    ++read;
+                } else if ((code_type == 1) && (next_type == 0)) {
+                    /* Low followed by high surrogate */
+                    code = ((code - 0xd800) << 10) + next + 0x2400;;
+                    ++read;
+                } else {
+                    /* Give up */
+                    code = 0xfffd;
+                }
+            } else {
+                /* Surrogate followed by non-surrogate. Give up */
+                code = 0xfffd;
+            }
+        } else {
+            code = 0xfffd;
+        }
+    }
+
+    *consumed = read;
+    return code;
+}
+
+/*
+ * Send the utf-16 Lisp unicode string to standard output as a
+ * utf8-encoded sequence of octets.
+ */
+static void
+utf16_output(unsigned short int* utf16, int len)
+{
+    while (len) {
+        int consumed;
+        int code = utf16_codepoint(utf16, len, &consumed);
+
+        /* Output the codepoint */
+        if (code < 0x80) {
+            putchar(code);
+        } else if (code < 0x800) {
+            utf8(code, 1);
+        } else if (code < 0x10000) {
+            utf8(code, 2);
+        } else if (code < 0x110000) {
+            utf8(code, 3);
+        } else {
+            /*
+             * This shouldn't happen, but if it does we don't want to
+             * signal any kind of error so just output a question mark
+             * so we can continue.
+             */
+            putchar('?');
+        }
+
+        len -= consumed;
+        utf16 += consumed;
+    }
+}
+#endif
+
+/*
+ * debug_print is used by %primitive print to output a string.
+ */
 lispobj
 debug_print(lispobj object)
 {
@@ -178,13 +324,7 @@ debug_print(lispobj object)
             len = lisp_string->length >> 2;
             lisp_chars = (unsigned short int*) lisp_string->data;
 
-            /*
-             * Do we really want to dump out the entire contents of
-             * the utf-16 string?  Should we just print out the low 8
-             * bits of each Lisp character?  Or maybe convert the
-             * utf-16 string to some more suitable encoding?
-             */
-            fwrite(lisp_chars, sizeof(*lisp_chars), len, stdout);
+            utf16_output(lisp_chars, len);
             putchar('\n');
     
             fflush(stdout);
@@ -192,6 +332,11 @@ debug_print(lispobj object)
             print(object);
         }
     } else {
+        /*
+         * We should actually ever get here because %primitive print
+         * is only supposed to take strings.  But if we do, it's
+         * useful to print something out anyway.
+         */
 #if 1
 	printf("obj @0x%lx: ", (unsigned long) object);
 #endif

-----------------------------------------------------------------------

Summary of changes:
 src/lisp/interr.c |  159 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 152 insertions(+), 7 deletions(-)


hooks/post-receive
-- 
CMU Common Lisp

    

Raymond Toy

tags

participants (1)