cvsuser 05/02/28 02:41:20
Modified: . MANIFEST
charset ascii.c iso-8859-1.c
imcc/docs syntax.pod
imcc imcc.l pbc.c symreg.c
include/parrot charset.h
src charset.c string.c
Added: t/op string_cs.t
Log:
Strings. Finally. 1 - charset syntax for constants
Strings. Finally. - This was the subject of Dan's design
document.
* implement charset: syntax for lexer
* ascii, binary, iso-8859-1 are now valid charsets
* start new test file for charsets
Revision Changes Path
1.835 +1 -0 parrot/MANIFEST
Index: MANIFEST
===================================================================
RCS file: /cvs/public/parrot/MANIFEST,v
retrieving revision 1.834
retrieving revision 1.835
diff -u -r1.834 -r1.835
--- MANIFEST 28 Feb 2005 08:30:35 -0000 1.834
+++ MANIFEST 28 Feb 2005 10:41:11 -0000 1.835
@@ -2857,6 +2857,7 @@
t/op/rx.t []
t/op/stacks.t []
t/op/string.t []
+t/op/string_cs.t []
t/op/stringu.t []
t/op/time.t []
t/op/trans.t []
1.1 parrot/t/op/string_cs.t
Index: string_cs.t
===================================================================
#! perl -w
# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
# $Id: string_cs.t,v 1.1 2005/02/28 10:41:15 leo Exp $
=head1 NAME
t/op/string_cs.t - String Charset Tests
=head1 SYNOPSIS
% perl -Ilib t/op/string_cs.t
=head1 DESCRIPTION
Tests charset support.
=cut
use Parrot::Test tests => 1;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
set S0, ascii:"ok 1\n"
print S0
set S0, binary:"ok 2\n"
print S0
set S0, iso-8859-1:"ok 3\n"
print S0
end
CODE
ok 1
ok 2
ok 3
OUTPUT
1.8 +2 -2 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- ascii.c 28 Feb 2005 08:29:30 -0000 1.7
+++ ascii.c 28 Feb 2005 10:41:16 -0000 1.8
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.7 2005/02/28 08:29:30 leo Exp $
+$Id: ascii.c,v 1.8 2005/02/28 10:41:16 leo Exp $
=head1 NAME
@@ -20,7 +20,7 @@
/* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
-static char typetable[256] = {
+static const unsigned char typetable[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
1.6 +8 -3 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- iso-8859-1.c 28 Feb 2005 08:29:30 -0000 1.5
+++ iso-8859-1.c 28 Feb 2005 10:41:16 -0000 1.6
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.5 2005/02/28 08:29:30 leo Exp $
+$Id: iso-8859-1.c,v 1.6 2005/02/28 10:41:16 leo Exp $
=head1 NAME
@@ -25,7 +25,7 @@
#define PUNCTUATION 3
#define DIGIT 4
-static unsigned char typetable[256] = {
+static const unsigned char typetable[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
@@ -122,9 +122,14 @@
static void
to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
{
- void *conversion_func;
+ charset_converter_t conversion_func;
if ((conversion_func = Parrot_find_charset_converter(interpreter,
source_string->charset, new_charset))) {
+ /*
+ * XXX conversion_func has wrong signature ?
+ *
+ * conversion_func(interpreter, new_charset, source_string);
+ */
}
else {
to_unicode(interpreter, source_string);
1.12 +6 -1 parrot/imcc/docs/syntax.pod
Index: syntax.pod
===================================================================
RCS file: /cvs/public/parrot/imcc/docs/syntax.pod,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- syntax.pod 20 Feb 2005 19:25:35 -0000 1.11
+++ syntax.pod 28 Feb 2005 10:41:17 -0000 1.12
@@ -1,5 +1,5 @@
# Copyright: 2001-2005 The Perl Foundation. All Rights Reserved.
-# $Id: syntax.pod,v 1.11 2005/02/20 19:25:35 bernhard Exp $
+# $Id: syntax.pod,v 1.12 2005/02/28 10:41:17 leo Exp $
=head1 NAME
@@ -85,6 +85,11 @@
Are delimited by B<">. A B<"> inside a string must be escaped by
B<\">.
+=item charset:"string constant"
+
+Like above with a charset attached to the string. Valid charset are
+currently: C<ascii>, C<binary>, and the default C<iso-8859-1>.
+
=item 'char constant'
Are delimited by B<'>.
1.124 +3 -4 parrot/imcc/imcc.l
Index: imcc.l
===================================================================
RCS file: /cvs/public/parrot/imcc/imcc.l,v
retrieving revision 1.123
retrieving revision 1.124
diff -u -r1.123 -r1.124
--- imcc.l 30 Nov 2004 09:35:10 -0000 1.123
+++ imcc.l 28 Feb 2005 10:41:18 -0000 1.124
@@ -450,10 +450,9 @@
}
<*>{UNICODE} {
- char *p = strchr(yytext, '"');
- valp->s = str_dup(p); /* enc:"..." */
- /* TODO pass charset */
- return(USTRINGC); /* XXX delete quotes, -> emit, pbc */
+ valp->s = str_dup(yytext); /* charset:"..." */
+ /* this is actually not unicode but a string with a charset */
+ return(USTRINGC);
}
<*>{CHARCONSTANT} {
valp->s = str_dup(yytext); /* XXX delete quotes, -> emit, pbc */
1.108 +11 -8 parrot/imcc/pbc.c
Index: pbc.c
===================================================================
RCS file: /cvs/public/parrot/imcc/pbc.c,v
retrieving revision 1.107
retrieving revision 1.108
diff -u -r1.107 -r1.108
--- pbc.c 18 Feb 2005 13:41:27 -0000 1.107
+++ pbc.c 28 Feb 2005 10:41:18 -0000 1.108
@@ -511,16 +511,19 @@
STRING *s = NULL;
char *charset = NULL;
/*
- * TODO strip delimiters in lexer, this needs adjustment in printint strings
+ * VT_UNICODE should better be VT_CHARSET
*/
- if (*buf == '"') {
+ if (r->type & VT_UNICODE) {
+ char *p;
+ p = strchr(r->name, ':');
+ assert(p);
+ *p = 0;
+ charset = r->name;
+ buf = p + 2; /* past delim */
+ s = string_unescape_cstring(interpreter, buf, '"', charset);
+ }
+ else if (*buf == '"') {
buf++;
- if (r->type & VT_UNICODE) {
- /*
- * not really a charset but our reprensentation
- */
- charset = "iso-8859-1"; /* still begin with ascii */
- }
s = string_unescape_cstring(interpreter, buf, '"', charset);
}
else if (*buf == '\'') { /* TODO handle python raw strings */
1.59 +1 -0 parrot/imcc/symreg.c
Index: symreg.c
===================================================================
RCS file: /cvs/public/parrot/imcc/symreg.c,v
retrieving revision 1.58
retrieving revision 1.59
diff -u -r1.58 -r1.59
--- symreg.c 30 Nov 2004 09:35:11 -0000 1.58
+++ symreg.c 28 Feb 2005 10:41:18 -0000 1.59
@@ -288,6 +288,7 @@
SymReg * r = _mk_symreg(hsh, name, t);
r->type = VTCONST;
if (t == 'U') {
+ /* charset:"string" */
r->set = 'S';
r->type |= VT_UNICODE;
}
1.6 +2 -1 parrot/include/parrot/charset.h
Index: charset.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/charset.h,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- charset.h 27 Feb 2005 11:03:40 -0000 1.5
+++ charset.h 28 Feb 2005 10:41:19 -0000 1.6
@@ -1,7 +1,7 @@
/* charset.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: charset.h,v 1.5 2005/02/27 11:03:40 leo Exp $
+ * $Id: charset.h,v 1.6 2005/02/28 10:41:19 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding
* Data Structure and Algorithms:
@@ -25,6 +25,7 @@
extern CHARSET *Parrot_binary_charset_ptr;
extern CHARSET *Parrot_default_charset_ptr;
extern CHARSET *Parrot_unicode_charset_ptr;
+extern CHARSET *Parrot_ascii_charset_ptr;
#endif
#define PARROT_DEFAULT_CHARSET Parrot_iso_8859_1_charset_ptr
1.5 +6 -1 parrot/src/charset.c
Index: charset.c
===================================================================
RCS file: /cvs/public/parrot/src/charset.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- charset.c 27 Feb 2005 11:03:43 -0000 1.4
+++ charset.c 28 Feb 2005 10:41:20 -0000 1.5
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: charset.c,v 1.4 2005/02/27 11:03:43 leo Exp $
+$Id: charset.c,v 1.5 2005/02/28 10:41:20 leo Exp $
=head1 NAME
@@ -19,6 +19,7 @@
CHARSET *Parrot_binary_charset_ptr;
CHARSET *Parrot_default_charset_ptr;
CHARSET *Parrot_unicode_charset_ptr;
+CHARSET *Parrot_ascii_charset_ptr;
CHARSET *
Parrot_new_charset(Interp *interpreter)
@@ -68,6 +69,10 @@
Parrot_unicode_charset_ptr = charset;
return 1;
}
+ if (!strcmp("ascii", charsetname)) {
+ Parrot_ascii_charset_ptr = charset;
+ return 1;
+ }
return 0;
}
1.234 +23 -14 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.233
retrieving revision 1.234
diff -u -r1.233 -r1.234
--- string.c 27 Feb 2005 13:19:51 -0000 1.233
+++ string.c 28 Feb 2005 10:41:20 -0000 1.234
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.233 2005/02/27 13:19:51 leo Exp $
+$Id: string.c,v 1.234 2005/02/28 10:41:20 leo Exp $
=head1 NAME
@@ -588,20 +588,20 @@
=item C<STRING *
string_make(Interp *interpreter, const void *buffer,
- UINTVAL len, const char *encoding_name, UINTVAL flags)>
+ UINTVAL len, const char *charset, UINTVAL flags)>
Creates and returns a new Parrot string using C<len> bytes of string
data read from C<buffer>.
-The value of C<encoding_name> specifies the string's representation.
+The value of C<charset> specifies the string's representation.
The currently recognised values are:
- 'iso-8859-1' = enum_stringrep_one
- 'ucs-2' = enum_stringrep_two
- 'utf-32' = enum_stringrep_four
+ 'iso-8859-1'
+ 'ascii'
+ 'binary'
-If C<encoding_name> is unspecified the the string reperesentation will default
-to C<enum_stringrep_unknown>.
+If C<charset> is unspecified the default charset 'iso-8859-1' will be
+used.
The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
together.
@@ -612,21 +612,30 @@
STRING *
string_make(Interp *interpreter, const void *buffer,
- UINTVAL len, const char *encoding_name, UINTVAL flags)
+ UINTVAL len, const char *charset_name, UINTVAL flags)
{
ENCODING *encoding;
CHARSET *charset;
- if (!encoding_name) {
+ if (!charset_name) {
internal_exception(MISSING_ENCODING_NAME,
- "string_make: no encoding name specified");
+ "string_make: no charset name specified");
}
- if (strcmp(encoding_name, "iso-8859-1") == 0 ) {
+ if (strcmp(charset_name, "iso-8859-1") == 0 ) {
encoding = Parrot_fixed_8_encoding_ptr;
charset = Parrot_iso_8859_1_charset_ptr;
}
+ else if (strcmp(charset_name, "ascii") == 0 ) {
+ encoding = Parrot_fixed_8_encoding_ptr;
+ charset = Parrot_ascii_charset_ptr;
+ }
+ else if (strcmp(charset_name, "binary") == 0 ) {
+ encoding = Parrot_fixed_8_encoding_ptr;
+ charset = Parrot_binary_charset_ptr;
+ }
else {
- internal_exception(UNIMPLEMENTED, "Can't make non-iso-8859-1 strings");
+ internal_exception(UNIMPLEMENTED,
+ "Can't make '%s' charset strings", charset_name);
}
return string_make_direct(interpreter, buffer, len,
encoding, charset, flags);
@@ -644,7 +653,7 @@
} __ptr_u;
/* PIO_eprintf(NULL, "string_make(): length = %ld, encoding name = %s, buffer = %s\n",
- len, encoding_name, (const char *)buffer); */
+ len, charset, (const char *)buffer); */
if (len && !buffer) {
internal_exception(BAD_BUFFER_SIZE,