PERL CVS PARROT 27 CVS COMMIT PARROT T SRC STRING T
Date: 27 Feb 2005 09:58:52 -0000

Subject: cvs commit: parrot/t/src string.t
From: leo@no-spam (Leopold Toetsch)

cvsuser 05/02/27 01:58:51

Modified: charset ascii.c ascii.h binary.c binary.h iso-8859-1.c iso-8859-1.h docs configuration.pod encodings fixed_8.c fixed_8.h include/parrot charset.h encoding.h misc.h pobj.h string_funcs.h string_primitives.h io io.c io_buf.c lib/Parrot/Configure Step.pm pf pf_items.c src charset.c encoding.c global.c global_setup.c objects.c pmc.c string.c string_primitives.c utils.c t/native_pbc string.t t/op string.t stringu.t t/pmc perlstring.t string.t t/src string.t Log:
the big string patch 1
Thanks to Will Coleda Dan's string patch is merged now * moved encoding, charset init to string_init * fixed one c99-ism * saneify_string() is a macro now, checking encoding, charset * disabled native_pbc/string.t * cast and reverse order of fixed_8.c:bytes() ptrs.
Revision Changes Path 1.5 +67 -4 parrot/charset/ascii.c Index: ascii.c ===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- ascii.c 5 Nov 2004 08:46:02 -0000 1.4
+++ ascii.c 27 Feb 2005 09:58:40 -0000 1.5
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.4 2004/11/05 08:46:02 leo Exp $
+$Id: ascii.c,v 1.5 2005/02/27 09:58:40 leo Exp $
=head1 NAME @@no-spam -20,6 +20,26 @@no-spam /* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
+static char typetable[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160-175 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176-191 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192-207 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 207-223 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224-239 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240-255 */
+};
+
+
static STRING *get_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count) {

return ENCODING_GET_BYTES(interpreter, source_string, offset, count);
}
@@no-spam -29,6 +49,10 @@no-spam }
+static STRING *get_graphemes_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count) {

+ return ENCODING_GET_BYTES_INPLACE(interpreter, source_string, offset, count, dest_string);

+}
+
static void to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset) {

internal_exception(UNIMPLEMENTED, "to_charset for ascii not implemented");

}
@@no-spam -43,6 +67,14 @@no-spam internal_exception(UNIMPLEMENTED, "to_unicode for ascii not implemented");

}
+static void from_charset(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
+static void from_unicode(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
/* A noop. can't compose ascii */
static void compose(Interp *interpreter, STRING *source_string) {
}
@@no-spam -127,7 +159,7 @@no-spam return 0;
}
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset) {

+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset) {

return -1;
}
@@no-spam -205,15 +237,38 @@no-spam return -1;
}
+static STRING *string_from_codepoint(Interp *interpreter, UINTVAL codepoint) {

+ STRING *return_string = NULL;
+ char real_codepoint = codepoint;
+ return_string = string_make(interpreter, &real_codepoint, 1, "ascii", 0);

+ return return_string;
+}
+
+static size_t compute_hash(Interp *interpreter, STRING *source_string) {
+ size_t hashval;
+
+ char *buffptr = (char *)source_string->strstart;
+ UINTVAL len = source_string->strlen; +
+ while (len--) { + hashval += hashval << 5;
+ hashval += *buffptr++;
+ }
+ return hashval;
+}
+
CHARSET *Parrot_charset_ascii_init(Interp *interpreter) {
CHARSET *return_set = Parrot_new_charset(interpreter);
CHARSET base_set = {
"ascii",
get_graphemes,
+ get_graphemes_inplace,
set_graphemes,
to_charset,
copy_to_charset,
to_unicode,
+ from_charset,
+ from_unicode,
compose,
decompose,
upcase,
@@no-spam -241,12 +296,20 @@no-spam is_newline,
find_newline,
find_not_newline,
- find_word_boundary + find_word_boundary,
+ string_from_codepoint,
+ compute_hash,
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}

};
- preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8");
+ /* Snag the global. This is... bad. Should be properly fixed at some + point */
+ preferred_encoding = Parrot_fixed_8_encoding_ptr;
+
+/* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
memcpy(return_set, &base_set, sizeof(CHARSET));
+ Parrot_register_charset(interpreter, "ascii", return_set);
return return_set;
}
1.5 +4 -2 parrot/charset/ascii.h Index: ascii.h ===================================================================
RCS file: /cvs/public/parrot/charset/ascii.h,v retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- ascii.h 5 Nov 2004 08:46:02 -0000 1.4
+++ ascii.h 27 Feb 2005 09:58:40 -0000 1.5
@@no-spam -1,7 +1,7 @@no-spam /* ascii.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: ascii.h,v 1.4 2004/11/05 08:46:02 leo Exp $
+ * $Id: ascii.h,v 1.5 2005/02/27 09:58:40 leo Exp $
* Overview:
* This is the header for the ascii charset functions * Data Structure and Algorithms:
@@no-spam -14,6 +14,7 @@no-spam #define PARROT_CHARSET_ASCII_H_GUARD static STRING *get_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+static STRING *get_graphemes_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);

static void set_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);

static void to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset);

static STRING *copy_to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset);

@@no-spam -27,7 +28,7 @@no-spam static void downcase_first(Interp *interpreter, STRING *source_string);
static void titlecase_first(Interp *interpreter, STRING *source_string);
static INTVAL compare(Interp *interpreter, STRING *lhs, STRING *rhs);
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset);

static INTVAL cs_rindex(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

static UINTVAL validate(Interp *interpreter, STRING *source_string);
static INTVAL is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset);

@@no-spam -46,6 +47,7 @@no-spam static INTVAL find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset);

static INTVAL find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset);

static INTVAL find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL offset);

+static size_t compute_hash(Interp *interpreter, STRING *source_string);
CHARSET *Parrot_charset_ascii_init(Interp *interpreter);
1.6 +49 -4 parrot/charset/binary.c Index: binary.c ===================================================================
RCS file: /cvs/public/parrot/charset/binary.c,v retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- binary.c 5 Nov 2004 08:46:02 -0000 1.5
+++ binary.c 27 Feb 2005 09:58:40 -0000 1.6
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: binary.c,v 1.5 2004/11/05 08:46:02 leo Exp $
+$Id: binary.c,v 1.6 2005/02/27 09:58:40 leo Exp $
=head1 NAME @@no-spam -24,6 +24,10 @@no-spam return ENCODING_GET_BYTES(interpreter, source_string, offset, count);
}
+static STRING *get_graphemes_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count) {

+ return ENCODING_GET_BYTES_INPLACE(interpreter, source_string, offset, count, dest_string);

+}
+
static void set_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string) {

ENCODING_SET_BYTES(interpreter, source_string, offset, replace_count, insert_string);

}
@@no-spam -43,6 +47,14 @@no-spam internal_exception(UNIMPLEMENTED, "to_unicode for binary not implemented");

}
+static void from_charset(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
+static void from_unicode(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
/* A noop. can't compose binary */
static void compose(Interp *interpreter, STRING *source_string) {
}
@@no-spam -79,7 +91,7 @@no-spam return 0;
}
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset) {

+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset) {

return -1;
}
@@no-spam -157,15 +169,39 @@no-spam return -1;
}
+static STRING *string_from_codepoint(Interp *interpreter, UINTVAL codepoint) {

+ STRING *return_string = NULL;
+ char real_codepoint = codepoint;
+ return_string = string_make(interpreter, &real_codepoint, 1, "binary", 0);

+ return return_string;
+}
+
+static size_t compute_hash(Interp *interpreter, STRING *source_string) {
+ size_t hashval;
+
+ char *buffptr = (char *)source_string->strstart;
+ UINTVAL len = source_string->strlen; +
+ while (len--) { + hashval += hashval << 5;
+ hashval += *buffptr++;
+ }
+ return hashval;
+}
+
+
CHARSET *Parrot_charset_binary_init(Interp *interpreter) {
CHARSET *return_set = Parrot_new_charset(interpreter);
CHARSET base_set = {
"binary",
get_graphemes,
+ get_graphemes_inplace,
set_graphemes,
to_charset,
copy_to_charset,
to_unicode,
+ from_charset,
+ from_unicode,
compose,
decompose,
upcase,
@@no-spam -193,12 +229,21 @@no-spam is_newline,
find_newline,
find_not_newline,
- find_word_boundary + find_word_boundary,
+ string_from_codepoint,
+ compute_hash,
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}

};
- preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8");
+ /* Snag the global. This is... bad. Should be properly fixed at some + point */
+ preferred_encoding = Parrot_fixed_8_encoding_ptr;
+
+/* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
+ memcpy(return_set, &base_set, sizeof(CHARSET));
+ Parrot_register_charset(interpreter, "binary", return_set);
return return_set;
}
1.4 +2 -2 parrot/charset/binary.h Index: binary.h ===================================================================
RCS file: /cvs/public/parrot/charset/binary.h,v retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- binary.h 5 Nov 2004 08:46:02 -0000 1.3
+++ binary.h 27 Feb 2005 09:58:40 -0000 1.4
@@no-spam -1,7 +1,7 @@no-spam /* binary.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: binary.h,v 1.3 2004/11/05 08:46:02 leo Exp $
+ * $Id: binary.h,v 1.4 2005/02/27 09:58:40 leo Exp $
* Overview:
* This is the header for the binary charset functions * Data Structure and Algorithms:
@@no-spam -27,7 +27,7 @@no-spam static void downcase_first(Interp *interpreter, STRING *source_string);
static void titlecase_first(Interp *interpreter, STRING *source_string);
static INTVAL compare(Interp *interpreter, STRING *lhs, STRING *rhs);
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset);

static INTVAL cs_rindex(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

static UINTVAL validate(Interp *interpreter, STRING *source_string);
static INTVAL is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset);

1.3 +161 -21 parrot/charset/iso-8859-1.c Index: iso-8859-1.c ===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- iso-8859-1.c 5 Nov 2004 08:46:02 -0000 1.2
+++ iso-8859-1.c 27 Feb 2005 09:58:40 -0000 1.3
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.2 2004/11/05 08:46:02 leo Exp $
+$Id: iso-8859-1.c,v 1.3 2005/02/27 09:58:40 leo Exp $
=head1 NAME @@no-spam -20,17 +20,91 @@no-spam /* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
+#define WHITESPACE 1
+#define WORDCHAR 2
+#define PUNCTUATION 3
+#define DIGIT 4
+
+static char typetable[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 160-175 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, /* 176-191 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 207-223 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
+ 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 240-255 */
+};
+
+static INTVAL find_thing(Interp *interpreter, STRING *string, UINTVAL start, UINTVAL type) {

+ INTVAL retval = -1;
+ UINTVAL offset = start;
+ INTVAL found = 0;
+ for (; offset < string->strlen; offset++) {
+ if (typetable[ENCODING_GET_CODEPOINT(interpreter, string, offset)] == type) {

+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ retval = offset;
+ }
+ return retval;
+}
+
+static INTVAL find_not_thing(Interp *interpreter, STRING *string, UINTVAL start, UINTVAL type) {

+ INTVAL retval = -1;
+ UINTVAL offset = start;
+ INTVAL found = 0;
+ for (; offset < string->strlen; offset++) {
+ if (typetable[ENCODING_GET_CODEPOINT(interpreter, string, offset)] != type) {

+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ retval = offset;
+ }
+ return retval;
+}
+
static STRING *get_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count) {

return ENCODING_GET_BYTES(interpreter, source_string, offset, count);
}
+static STRING *get_graphemes_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count) {

+ return ENCODING_GET_BYTES_INPLACE(interpreter, source_string, offset, count, dest_string);

+}
+
static void set_graphemes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string) {

ENCODING_SET_BYTES(interpreter, source_string, offset, replace_count, insert_string);

}
+static void from_charset(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
+static void from_unicode(Interp *interpreter, STRING *source_string) {
+ internal_exception(UNIMPLEMENTED, "Can't do this yet");
+}
+
+
static void to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset) {

- internal_exception(UNIMPLEMENTED, "to_charset for iso-8859-1 not implemented");

+ void *conversion_func;
+ if ((conversion_func = Parrot_find_charset_converter(interpreter, source_string->charset, new_charset))) {

+ } else {
+ to_unicode(interpreter, source_string);
+ new_charset->from_charset(interpreter, source_string);
+ }
}
static STRING *copy_to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset) {

@@no-spam -124,15 +198,38 @@no-spam }
static INTVAL compare(Interp *interpreter, STRING *lhs, STRING *rhs) {
- return 0;
+ INTVAL retval = memcmp(lhs->strstart, rhs->strstart, lhs->strlen);
+ if (!retval && lhs->strlen < rhs->strlen) {
+ retval = -1;
+ }
+ if (retval) {
+ retval = retval > 0 ? 1 : -1;
+ }
+ return retval;
}
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset) {

- return -1;
+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset) {

+ UINTVAL base_size, search_size;
+ char *base, *search;
+ INTVAL retval;
+ if (source_string->charset != search_string->charset) {
+ internal_exception(UNIMPLEMENTED, "Cross-charset index not supported");

+ }
+ + retval = Parrot_byte_index(interpreter, source_string, search_string, offset);

+ return retval;
}
static INTVAL cs_rindex(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset) {

- return -1;
+ UINTVAL base_size, search_size;
+ char *base, *search;
+ INTVAL retval;
+ if (source_string->charset != search_string->charset) {
+ internal_exception(UNIMPLEMENTED, "Cross-charset index not supported");

+ }
+ + retval = Parrot_byte_rindex(interpreter, source_string, search_string, offset);

+ return retval;
}
/* Binary's always valid */
@@no-spam -142,55 +239,66 @@no-spam /* No word chars in binary data */
static INTVAL is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == WORDCHAR);
}
static INTVAL find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return -1;
+ return find_thing(interpreter, source_string, offset, WORDCHAR);
}
static INTVAL find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return offset;
+ return find_not_thing(interpreter, source_string, offset, WORDCHAR);
}
static INTVAL is_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == WHITESPACE);
}
static INTVAL find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return -1;
+ return find_thing(interpreter, source_string, offset, WHITESPACE);
}
static INTVAL find_not_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return offset;
+ return find_not_thing(interpreter, source_string, offset, WHITESPACE);
}
static INTVAL is_digit(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == DIGIT);
}
static INTVAL find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return -1;
+ return find_thing(interpreter, source_string, offset, DIGIT);
}
static INTVAL find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return offset;
+ return find_not_thing(interpreter, source_string, offset, DIGIT);
}
static INTVAL is_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == PUNCTUATION);
}
static INTVAL find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return -1;
+ return find_thing(interpreter, source_string, offset, PUNCTUATION);
}
static INTVAL find_not_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return offset;
+ return find_not_thing(interpreter, source_string, offset, PUNCTUATION);
+
}
static INTVAL is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset) {

- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return codepoint == 13;
}
static INTVAL find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset) {

@@no-spam -205,15 +313,38 @@no-spam return -1;
}
+static STRING *string_from_codepoint(Interp *interpreter, UINTVAL codepoint) {

+ STRING *return_string = NULL;
+ char real_codepoint = codepoint;
+ return_string = string_make(interpreter, &real_codepoint, 1, "iso-8859-1", 0);

+ return return_string;
+}
+
+static size_t compute_hash(Interp *interpreter, STRING *source_string) {
+ size_t hashval = 0;
+
+ char *buffptr = (char *)source_string->strstart;
+ UINTVAL len = source_string->strlen; +
+ while (len--) { + hashval += hashval << 5;
+ hashval += *buffptr++;
+ }
+ return hashval;
+}
+
CHARSET *Parrot_charset_iso_8859_1_init(Interp *interpreter) {
CHARSET *return_set = Parrot_new_charset(interpreter);
CHARSET base_set = {
"iso-8859-1",
get_graphemes,
+ get_graphemes_inplace,
set_graphemes,
to_charset,
copy_to_charset,
to_unicode,
+ from_charset,
+ from_unicode,
compose,
decompose,
upcase,
@@no-spam -241,12 +372,21 @@no-spam is_newline,
find_newline,
find_not_newline,
- find_word_boundary + find_word_boundary,
+ string_from_codepoint,
+ compute_hash,
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}

+
};
- preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8");
+ /* Snag the global. This is... bad. Should be properly fixed at some + point */
+ preferred_encoding = Parrot_fixed_8_encoding_ptr;
+
+/* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
memcpy(return_set, &base_set, sizeof(CHARSET));
+ Parrot_register_charset(interpreter, "iso-8859-1", return_set);
return return_set;
}
1.3 +3 -2 parrot/charset/iso-8859-1.h Index: iso-8859-1.h ===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- iso-8859-1.h 5 Nov 2004 08:46:02 -0000 1.2
+++ iso-8859-1.h 27 Feb 2005 09:58:40 -0000 1.3
@@no-spam -1,7 +1,7 @@no-spam /* iso_8859_1.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: iso-8859-1.h,v 1.2 2004/11/05 08:46:02 leo Exp $
+ * $Id: iso-8859-1.h,v 1.3 2005/02/27 09:58:40 leo Exp $
* Overview:
* This is the header for the iso_8859-1 charset functions * Data Structure and Algorithms:
@@no-spam -27,7 +27,7 @@no-spam static void downcase_first(Interp *interpreter, STRING *source_string);
static void titlecase_first(Interp *interpreter, STRING *source_string);
static INTVAL compare(Interp *interpreter, STRING *lhs, STRING *rhs);
-static INTVAL cs_index(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

+static INTVAL cs_index(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset);

static INTVAL cs_rindex(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

static UINTVAL validate(Interp *interpreter, STRING *source_string);
static INTVAL is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset);

@@no-spam -46,6 +46,7 @@no-spam static INTVAL find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset);

static INTVAL find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset);

static INTVAL find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL offset);

+static size_t compute_hash(Interp *interpreter, STRING *source_string);
CHARSET *Parrot_charset_iso_8859_1_init(Interp *interpreter);
1.6 +2 -3 parrot/docs/configuration.pod Index: configuration.pod ===================================================================
RCS file: /cvs/public/parrot/docs/configuration.pod,v retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- configuration.pod 30 Apr 2004 06:03:33 -0000 1.5
+++ configuration.pod 27 Feb 2005 09:58:42 -0000 1.6
@@no-spam -1,5 +1,5 @@no-spam # Copyright: 2004 The Perl Foundation. All Rights Reserved.
-# $Id: configuration.pod,v 1.5 2004/04/30 06:03:33 leo Exp $
+# $Id: configuration.pod,v 1.6 2005/02/27 09:58:42 leo Exp $
=head1 NAME @@no-spam -225,7 +225,7 @@no-spam =item C< HAS_(\w+) > features -defines PARROT_HAS_XX in F<include/parrot/has_header.h>
+defines PARROT_HAS_XXX in F<include/parrot/has_header.h>
=item C< TEMP_(\w+) > temporary settings @@no-spam -234,7 +234,6 @@no-spam =back -
=head1 SEE ALSO =over 4
1.6 +54 -10 parrot/encodings/fixed_8.c Index: fixed_8.c ===================================================================
RCS file: /cvs/public/parrot/encodings/fixed_8.c,v retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- fixed_8.c 5 Nov 2004 08:46:03 -0000 1.5
+++ fixed_8.c 27 Feb 2005 09:58:42 -0000 1.6
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: fixed_8.c,v 1.5 2004/11/05 08:46:03 leo Exp $
+$Id: fixed_8.c,v 1.6 2005/02/27 09:58:42 leo Exp $
=head1 NAME @@no-spam -29,7 +29,7 @@no-spam }
/* codepoints are bytes, so delegate */
-static UINTVAL get_codepoint(Interp *interpreter, STRING *source_string, UINTVAL offset) {

+static UINTVAL get_codepoint(Interp *interpreter, const STRING *source_string, UINTVAL offset) {

return get_byte(interpreter, source_string, offset);
}
@@no-spam -38,25 +38,63 @@no-spam set_byte(interpreter, source_string, offset, codepoint);
}
-static UINTVAL get_byte(Interp *interpreter, STRING *source_string, UINTVAL offset) {

+static UINTVAL get_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset) {

char *contents = source_string->strstart;
if (offset >= source_string->bufused) {
- internal_exception(0, "get_byte past the end of the buffer");
+ assert(offset < source_string->bufused);
+ internal_exception(0, "get_byte past the end of the buffer (%i of %i)", offset, source_string->bufused);

}
return contents[offset];
}
-static void set_byte(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL byte) {

+static void set_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset, UINTVAL byte) {

+ char *contents;
+ if (offset >= source_string->bufused) {
+ internal_exception(0, "set_byte past the end of the buffer");
+ }
+ contents = source_string->strstart;
+ contents[offset] = byte;
+
}
/* Delegate to get_bytes */
static STRING *get_codepoints(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count) {

-
- return get_bytes(interpreter, source_string, offset, count);
+ STRING *return_string = get_bytes(interpreter, source_string, offset, count);

+ return_string->charset = source_string->charset;
+ return return_string;
}
static STRING *get_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count) {

- STRING *return_string = NULL;
+ STRING *return_string = Parrot_make_COW_reference(interpreter, source_string);

+ return_string->encoding = source_string->encoding;
+ return_string->charset = source_string->charset;
+
+ return_string->strstart = (char *)return_string->strstart + offset ;
+ return_string->bufused = count;
+
+ return_string->strlen = count;
+ return_string->hashval = 0;
+
+ return return_string;
+}
+
+
+/* Delegate to get_bytes */
+static STRING *get_codepoints_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count) {

+
+ return get_bytes_inplace(interpreter, source_string, offset, count, dest_string);

+}
+
+static STRING *get_bytes_inplace(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *return_string) {

+ Parrot_reuse_COW_reference(interpreter, source_string, return_string);
+ return_string->encoding = source_string->encoding;
+ return_string->charset = source_string->charset;
+
+ return_string->strstart = (char *)return_string->strstart + offset ;
+ return_string->bufused = count;
+
+ return_string->strlen = count;
+ return_string->hashval = 0;
return return_string;
}
@@no-spam -67,6 +105,7 @@no-spam }
static void set_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_bytes) {

+ string_replace(interpreter, source_string, offset, count, new_bytes, NULL);

}
/* Unconditionally makes the string be in this encoding, if that's @@no-spam -76,11 +115,12 @@no-spam static UINTVAL codepoints(Interp *interpreter, STRING *source_string) {
- return source_string->bufused;
+ return bytes(interpreter, source_string);
}
static UINTVAL bytes(Interp *interpreter, STRING *source_string) {
- return source_string->bufused;
+ return source_string->bufused -
+ ((char *)source_string->strstart - (char *)PObj_bufstart(source_string));

}
ENCODING *Parrot_encoding_fixed_8_init(Interp *interpreter) {
@@no-spam -88,6 +128,7 @@no-spam ENCODING base_encoding = {
"fixed_8",
+ 1, /* Max bytes per codepoint */
to_encoding,
copy_to_encoding,
get_codepoint,
@@no-spam -95,7 +136,9 @@no-spam get_byte,
set_byte,
get_codepoints,
+ get_codepoints_inplace,
get_bytes,
+ get_bytes_inplace,
set_codepoints,
set_bytes,
become_encoding,
@@no-spam -103,6 +146,7 @@no-spam bytes };
memcpy(return_encoding, &base_encoding, sizeof(ENCODING));
+ Parrot_register_encoding(interpreter, "fixed_8", return_encoding);
return return_encoding;
}
1.3 +6 -4 parrot/encodings/fixed_8.h Index: fixed_8.h ===================================================================
RCS file: /cvs/public/parrot/encodings/fixed_8.h,v retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- fixed_8.h 3 Nov 2004 17:37:43 -0000 1.2
+++ fixed_8.h 27 Feb 2005 09:58:42 -0000 1.3
@@no-spam -1,7 +1,7 @@no-spam /* fixed_8.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: fixed_8.h,v 1.2 2004/11/03 17:37:43 dan Exp $
+ * $Id: fixed_8.h,v 1.3 2005/02/27 09:58:42 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding * Data Structure and Algorithms:
@@no-spam -15,12 +15,14 @@no-spam static void to_encoding(Interp *interpreter, STRING *source_string);
static STRING *copy_to_encoding(Interp *interpreter, STRING *source_string);
-static UINTVAL get_codepoint(Interp *interpreter, STRING *source_string, UINTVAL offset);

+static UINTVAL get_codepoint(Interp *interpreter, const STRING *source_string, UINTVAL offset);

static void set_codepoint(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL codepoint);

-static UINTVAL get_byte(Interp *interpreter, STRING *source_string, UINTVAL offset);

-static void set_byte(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL byte);

+static UINTVAL get_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset);

+static void set_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset, UINTVAL byte);

static STRING *get_codepoints(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+static STRING *get_codepoints_inplace(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);

static STRING *get_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+static STRING *get_bytes_inplace(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *dest_string);

static void set_codepoints(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_codepoints);

static void set_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_bytes);

static void become_encoding(Interp *interpreter, STRING *source_string);
1.4 +84 -3 parrot/include/parrot/charset.h Index: charset.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/charset.h,v retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- charset.h 3 Nov 2004 19:22:24 -0000 1.3
+++ charset.h 27 Feb 2005 09:58:43 -0000 1.4
@@no-spam -1,7 +1,7 @@no-spam /* charset.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: charset.h,v 1.3 2004/11/03 19:22:24 dan Exp $
+ * $Id: charset.h,v 1.4 2005/02/27 09:58:43 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding * Data Structure and Algorithms:
@@no-spam -13,16 +13,32 @@no-spam #if !defined(PARROT_CHARSET_H_GUARD)
#define PARROT_CHARSET_H_GUARD +
#include "parrot/encoding.h"
-struct _struct;
+struct _charset;
typedef struct _charset CHARSET;
+
+#if !defined PARROT_NO_EXTERN_CHARSET_PTRS +extern CHARSET *Parrot_iso_8859_1_charset_ptr;
+extern CHARSET *Parrot_binary_charset_ptr;
+extern CHARSET *Parrot_default_charset_ptr;
+extern CHARSET *Parrot_unicode_charset_ptr;
+#endif +
+#define PARROT_DEFAULT_CHARSET Parrot_iso_8859_1_charset_ptr +#define PARROT_BINARY_CHARSET Parrot_binary_charset +#define PARROT_UNICODE_CHARSET Parrot_unicode_charset_ptr +
typedef STRING *(*charset_get_graphemes_t)(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+typedef STRING *(*charset_get_graphemes_inplace_t)(Interp *interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);

typedef void (*charset_set_graphemes_t)(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);

typedef void (*charset_to_charset_t)(Interp *interpreter, STRING *source_string, CHARSET *new_charset);

typedef STRING *(*charset_copy_to_charset_t)(Interp *interpreter, STRING *source_string, CHARSET *new_charset);

typedef void (*charset_to_unicode_t)(Interp *interpreter, STRING *source_string);

+typedef void (*charset_from_charset_t)(Interp *interpreter, STRING *source_string);

+typedef void (*charset_from_unicode_t)(Interp *interpreter, STRING *source_string);

typedef void (*charset_compose_t)(Interp *interpreter, STRING *source_string);

typedef void (*charset_decompose_t)(Interp *interpreter, STRING *source_string);

typedef void (*charset_upcase_t)(Interp *interpreter, STRING *source_string);

@@no-spam -32,7 +48,7 @@no-spam typedef void (*charset_downcase_first_t)(Interp *interpreter, STRING *source_string);

typedef void (*charset_titlecase_first_t)(Interp *interpreter, STRING *source_string);

typedef INTVAL (*charset_compare_t)(Interp *interpreter, STRING *lhs, STRING *rhs);

-typedef INTVAL (*charset_index_t)(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

+typedef INTVAL (*charset_index_t)(Interp *interpreter, const STRING *source_string, const STRING *search_string, UINTVAL offset);

typedef INTVAL (*charset_rindex_t)(Interp *interpreter, STRING *source_string, STRING *search_string, UINTVAL offset);

typedef UINTVAL (*charset_validate_t)(Interp *interpreter, STRING *source_string);

typedef INTVAL (*charset_is_wordchar_t)(Interp *interpreter, STRING *source_string, UINTVAL offset);

@@no-spam -51,16 +67,28 @@no-spam typedef INTVAL (*charset_find_newline_t)(Interp *interpreter, STRING *source_string, UINTVAL offset);

typedef INTVAL (*charset_find_not_newline_t)(Interp *interpreter, STRING *source_string, UINTVAL offset);

typedef INTVAL (*charset_find_word_boundary_t)(Interp *interpreter, STRING *source_string, UINTVAL offset);

+typedef STRING *(*charset_string_from_codepoint_t)(Interp *interpreter, UINTVAL codepoint);

+typedef size_t (*charset_compute_hash_t)(Interp *interpreter, STRING *source_string);

CHARSET *Parrot_new_charset(Interp *interpreter);
+CHARSET *Parrot_load_charset(Interp *interpreter, const char *charsetname);
+CHARSET *Parrot_find_charset(Interp *interpreter, const char *charsetname);
+INTVAL Parrot_register_charset(Interp *interpreter, const char *charsetname, CHARSET *charset);

+INTVAL Parrot_make_default_charset(Interp *interpreter, const char *charsetname, CHARSET *charset);

+CHARSET *Parrot_default_charset(Interp *interpreter);
+typedef INTVAL (*charset_converter_t)(Interp *interpreter, CHARSET *lhs, CHARSET *rhs);

+charset_converter_t Parrot_find_charset_converter(Interp *interpreter, CHARSET *lhs, CHARSET *rhs);

struct _charset {
const char *name;
charset_get_graphemes_t get_graphemes;
+ charset_get_graphemes_inplace_t get_graphemes_inplace;
charset_set_graphemes_t set_graphemes;
charset_to_charset_t to_charset;
charset_copy_to_charset_t copy_to_charset;
charset_to_unicode_t to_unicode;
+ charset_from_charset_t from_charset;
+ charset_from_unicode_t from_unicode;
charset_compose_t compose;
charset_decompose_t decompose;
charset_upcase_t upcase;
@@no-spam -89,8 +117,61 @@no-spam charset_find_newline_t find_newline;
charset_find_not_newline_t find_not_newline;
charset_find_word_boundary_t find_word_boundary;
+ charset_string_from_codepoint_t string_from_codepoint;
+ charset_compute_hash_t compute_hash;
+ ENCODING encoding_overrides;
};
+#define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, offset, count)

+#define CHARSET_GET_GRAPHEMES_INPLACE(interp, source, dest, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, dest, offset, count)

+#define CHARSET_SET_GRAPHEMES(interp, source, offset, replace_count, insert) ((CHARSET *)source->charset)->set_graphemes(interpreter, source, offset, replace_count, insert)

+#define CHARSET_TO_CHARSET(interp, source, new_charset) ((CHARSET *)source->charset)->to_charset(interpreter, source, new_charset)

+#define CHARSET_COPY_TO_CHARSET(interp, source, new_charset) ((CHARSET *)source->charset)->copy_to_charset(interpreter, source, new_charset)

+#define CHARSET_TO_UNICODE(interp, source) ((CHARSET *)source->charset)->to_unicode(interpreter, source)

+#define CHARSET_COMPOSE(interp, source) ((CHARSET *)source->charset)->compose(interpreter, source)

+#define CHARSET_DECOMPOSE(interp, source) ((CHARSET *)source->charset)->decompose(interpreter, source)

+#define CHARSET_UPCASE(interp, source) ((CHARSET *)source->charset)->upcase(interpreter, source)

+#define CHARSET_DOWNCASE(interp, source) ((CHARSET *)source->charset)->downcase(interpreter, source)

+#define CHARSET_TITLECASE(interp, source) ((CHARSET *)source->charset)->titlecase(interpreter, source)

+#define CHARSET_UPCASE_FIRST(interp, source) ((CHARSET *)source->charset)->upcase_first(interpreter, source)

+#define CHARSET_DOWNCASE_FIRST(interp, source) ((CHARSET *)source->charset)->downcase_first(interpreter, source)

+#define CHARSET_TITLECASE_FIRST(interp, source) ((CHARSET *)source->charset)->titlecase_first(interpreter, source)

+#define CHARSET_COMPARE(interp, lhs, rhs) ((CHARSET *)lhs->charset)->compare(interpreter, lhs, rhs)

+#define CHARSET_INDEX(interp, source, search, offset) ((CHARSET *)source->charset)->index(interpreter, source, search, offset)

+#define CHARSET_RINDEX(interp, source, search, offset) ((CHARSET *)source->charset)->rindex(interpreter, source, search, offset)

+#define CHARSET_VALIDATE(interp, source, offset) ((CHARSET *)source->charset)->validate(interpreter, source)

+#define CHARSET_IS_WORDCHAR(interp, source, offset) ((CHARSET *)source->charset)->is_wordchar(interpreter, source, offset)

+#define CHARSET_FIND_WORDCHAR(interp, source, offset) ((CHARSET *)source->charset)->find_wordchar(interpreter, source, offset)

+#define CHARSET_FIND_NOT_WORDCHAR(interp, source, offset) ((CHARSET *)source->charset)->find_not_wordchar(interpreter, source, offset)

+#define CHARSET_IS_WHITESPACE(interp, source, offset) ((CHARSET *)source->charset)->is_whitespace(interpreter, source, offset)

+#define CHARSET_FIND_WHITESPACE(interp, source, offset) ((CHARSET *)source->charset)->find_whitespace(interpreter, source, offset)

+#define CHARSET_FIND_NOT_WHITESPACE(interp, source, offset) ((CHARSET *)source->charset)->find_not_whitespace(interpreter, source, offset)

+#define CHARSET_IS_DIGIT(interp, source, offset) ((CHARSET *)source->charset)->is_digit(interpreter, source, offset)

+#define CHARSET_FIND_DIGIT(interp, source, offset) ((CHARSET *)source->charset)->find_digit(interpreter, source, offset)

+#define CHARSET_FIND_NOT_DIGIT(interp, source, offset) ((CHARSET *)source->charset)->find_not_digit(interpreter, source, offset)

+#define CHARSET_IS_PUNCTUATION(interp, source, offset) ((CHARSET *)source->charset)->is_punctuation(interpreter, source, offset)

+#define CHARSET_FIND_PUNCTUATION(interp, source, offset) ((CHARSET *)source->charset)->find_punctuation(interpreter, source, offset)

+#define CHARSET_FIND_NOT_PUNCTUATION(interp, source, offset) ((CHARSET *)source->charset)->find_not_punctuation(interpreter, source, offset)

+#define CHARSET_IS_NEWLINE(interp, source, offset) ((CHARSET *)source->charset)->is_newline(interpreter, source, offset)

+#define CHARSET_FIND_NEWLINE(interp, source, offset) ((CHARSET *)source->charset)->find_newline(interpreter, source, offset)

+#define CHARSET_FIND_NOT_NEWLINE(interp, source, offset) ((CHARSET *)source->charset)->find_not_newline(interpreter, source, offset)

+#define CHARSET_FIND_WORD_BOUNDARY(interp, source, offset) ((CHARSET *)source->charset)->find_word_boundary(interpreter, source, offset)

+#define CHARSET_COMPUTE_HASH(interp, source) ((CHARSET *)source->charset)->compute_hash(interpreter, source)

+#define CHARSET_TO_ENCODING(interp, source, offset, count) ((ENCODING *)source->encoding)->to_encoding(interp, source, offset, count)

+#define CHARSET_COPY_TO_ENCODING(interp, source) ((ENCODING *)source->encoding)->copy_to_encoding(interp, source)

+#define CHARSET_GET_CODEPOINT(interp, source, offset) ((ENCODING *)source->encoding)->get_codepoint(interp, source, offset)

+#define CHARSET_SET_CODEPOINT(interp, source, offset, codepoint) ((ENCODING *)source->encoding)->set_codepoint(interp, source, offset, codepoint)

+#define CHARSET_GET_BYTE(interp, source, offset) ((ENCODING *)source->encoding)->get_byte(interp, source, offset)

+#define CHARSET_SET_BYTE(interp, source, offset, value) ((ENCODING *)source->encoding)->set_byte(interp, source, offset, value)

+#define CHARSET_GET_CODEPOINTS(interp, source, offset, count) ((ENCODING *)source->encoding)->get_codepoints(interp, source, offset, count)

+#define CHARSET_GET_CODEPOINTS_INPLACE(interp, source, dest, offset, count) ((ENCODING *)source->encoding)->get_codepoints_inplace(interp, source, dest, offset, count)

+#define CHARSET_GET_BYTES(interp, source, offset, count) ((ENCODING *)source->encoding)->get_bytes(interp, source, offset, count)

+#define CHARSET_GET_BYTES_INPLACE(interp, source, offset, count, dest) ((ENCODING *)source->encoding)->get_bytes(interp, source, offset, count, dest)

+#define CHARSET_SET_CODEPOINTS(interp, source, offset, count, newdata) ((ENCODING *)source->encoding)->set_codepoints(interp, source, offset, count, newdata)

+#define CHARSET_SET_BYTES(interp, source, offset, count, newdata) ((ENCODING *)source->encoding)->set_bytes(interp, source, offset, count, newdata)

+#define CHARSET_BECOME_ENCODING(interp, source) ((ENCODING *)source->encoding)->become_encoding(interp, source)

+#define CHARSET_CODEPOINTS(interp, source) ((ENCODING *)source->encoding)->codepoints(interp, source)

+#define CHARSET_BYTES(interp, source) ((ENCODING *)source->encoding)->bytes(interp, source)

#endif /* PARROT_CHARSET_H_GUARD */
1.30 +27 -4 parrot/include/parrot/encoding.h Index: encoding.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/encoding.h,v retrieving revision 1.29
retrieving revision 1.30
diff -u -r1.29 -r1.30
--- encoding.h 3 Nov 2004 20:58:09 -0000 1.29
+++ encoding.h 27 Feb 2005 09:58:43 -0000 1.30
@@no-spam -1,7 +1,7 @@no-spam /* encoding.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: encoding.h,v 1.29 2004/11/03 20:58:09 dan Exp $
+ * $Id: encoding.h,v 1.30 2005/02/27 09:58:43 leo Exp $
* Overview:
* This is the header for the generic encoding functions * Data Structure and Algorithms:
@@no-spam -17,12 +17,14 @@no-spam typedef void (*encoding_to_encoding_t)(Interp* interpreter, STRING *string);
typedef STRING *(*encoding_copy_to_encoding_t)(Interp* interpreter, STRING *string);

-typedef UINTVAL (*encoding_get_codepoint_t)(Interp* interpreter, STRING *source_string, UINTVAL offset);

+typedef UINTVAL (*encoding_get_codepoint_t)(Interp* interpreter, const STRING *source_string, UINTVAL offset);

typedef void (*encoding_set_codepoint_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL codepoint);

-typedef UINTVAL (*encoding_get_byte_t)(Interp* interpreter, STRING *source_string, UINTVAL offset);

-typedef void (*encoding_set_byte_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+typedef UINTVAL (*encoding_get_byte_t)(Interp* interpreter, const STRING *source_string, UINTVAL offset);

+typedef void (*encoding_set_byte_t)(Interp* interpreter, const STRING *source_string, UINTVAL offset, UINTVAL count);

typedef STRING *(*encoding_get_codepoints_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

typedef STRING *(*encoding_get_bytes_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);

+typedef STRING *(*encoding_get_codepoints_inplace_t)(Interp* interpreter, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);

+typedef STRING *(*encoding_get_bytes_inplace_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *dest_string);

typedef void (*encoding_set_codepoints_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_bytes);

typedef void (*encoding_set_bytes_t)(Interp* interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_bytes);

typedef void (*encoding_become_encoding_t)(Interp* interpreter, STRING *source_string);

@@no-spam -31,6 +33,7 @@no-spam struct _encoding {
const char *name;
+ UINTVAL max_bytes_per_codepoint;
encoding_to_encoding_t to_encoding;
encoding_copy_to_encoding_t copy_to_encoding;
encoding_get_codepoint_t get_codepoint;
@@no-spam -38,7 +41,9 @@no-spam encoding_get_byte_t get_byte;
encoding_set_byte_t set_byte;
encoding_get_codepoints_t get_codepoints;
+ encoding_get_codepoints_inplace_t get_codepoints_inplace;
encoding_get_bytes_t get_bytes;
+ encoding_get_bytes_inplace_t get_bytes_inplace;
encoding_set_codepoints_t set_codepoints;
encoding_set_bytes_t set_bytes;
encoding_become_encoding_t become_encoding;
@@no-spam -48,9 +53,25 @@no-spam typedef struct _encoding ENCODING;
+#if !defined PARROT_NO_EXTERN_ENCODING_PTRS +extern ENCODING *Parrot_fixed_8_encoding_ptr;
+#endif +
+#define PARROT_DEFAULT_ENCODING Parrot_fixed_8_encoding_ptr +#define PARROT_FIXED_8_ENCODING Parrot_fixed_8_encoding_ptr +#define PARROT_DEFAULT_FOR_UNICODE_ENCODING NULL +
ENCODING *Parrot_new_encoding(Interp* interpreter);
ENCODING *Parrot_load_encoding(Interp* interpreter, const char *encoding_name);

+ENCODING *Parrot_find_encoding(Interp *interpreter, const char *encodingname);

+INTVAL Parrot_register_encoding(Interp *interpreter, const char *encodingname, ENCODING *encoding);

+INTVAL Parrot_make_default_encoding(Interp *interpreter, const char *encodingname, ENCODING *encoding);

+ENCODING *Parrot_default_encoding(Interp *interpreter);
+typedef INTVAL (*encoding_converter_t)(Interp *interpreter, ENCODING *lhs, ENCODING *rhs);

+encoding_converter_t Parrot_find_encoding_converter(Interp *interpreter, ENCODING *lhs, ENCODING *rhs);

+
+#define ENCODING_MAX_BYTES_PER_CODEPOINT(interp, source) ((ENCODING *)source->encoding)->max_bytes_per_codepoint
#define ENCODING_TO_ENCODING(interp, source, offset, count) ((ENCODING *)source->encoding)->to_encoding(interp, source, offset, count)

#define ENCODING_COPY_TO_ENCODING(interp, source) ((ENCODING *)source->encoding)->copy_to_encoding(interp, source)

#define ENCODING_GET_CODEPOINT(interp, source, offset) ((ENCODING *)source->encoding)->get_codepoint(interp, source, offset)

@@no-spam -58,7 +79,9 @@no-spam #define ENCODING_GET_BYTE(interp, source, offset) ((ENCODING *)source->encoding)->get_byte(interp, source, offset)

#define ENCODING_SET_BYTE(interp, source, offset, value) ((ENCODING *)source->encoding)->set_byte(interp, source, offset, value)

#define ENCODING_GET_CODEPOINTS(interp, source, offset, count) ((ENCODING *)source->encoding)->get_codepoints(interp, source, offset, count)

+#define ENCODING_GET_CODEPOINTS_INPLACE(interp, source, dest, offset, count) ((ENCODING *)source->encoding)->get_codepoints_inplace(interp, source, dest, offset, count)

#define ENCODING_GET_BYTES(interp, source, offset, count) ((ENCODING *)source->encoding)->get_bytes(interp, source, offset, count)

+#define ENCODING_GET_BYTES_INPLACE(interp, source, offset, count, dest) ((ENCODING *)source->encoding)->get_bytes_inplace(interp, source, offset, count, dest)

#define ENCODING_SET_CODEPOINTS(interp, source, offset, count, newdata) ((ENCODING *)source->encoding)->set_codepoints(interp, source, offset, count, newdata)

#define ENCODING_SET_BYTES(interp, source, offset, count, newdata) ((ENCODING *)source->encoding)->set_bytes(interp, source, offset, count, newdata)

#define ENCODING_BECOME_ENCODING(interp, source) ((ENCODING *)source->encoding)->become_encoding(interp, source)

1.23 +3 -2 parrot/include/parrot/misc.h Index: misc.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/misc.h,v retrieving revision 1.22
retrieving revision 1.23
diff -u -r1.22 -r1.23
--- misc.h 18 Oct 2004 01:35:25 -0000 1.22
+++ misc.h 27 Feb 2005 09:58:43 -0000 1.23
@@no-spam -1,7 +1,7 @@no-spam /* misc.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: misc.h,v 1.22 2004/10/18 01:35:25 brentdax Exp $
+ * $Id: misc.h,v 1.23 2005/02/27 09:58:43 leo Exp $
* Overview:
* Miscellaneous functions, mainly the Parrot_sprintf family * Data Structure and Algorithms:
@@no-spam -41,7 +41,8 @@no-spam void Parrot_destroy_la(long *);
void Parrot_destroy_cpa(char **);
PMC* tm_to_array(Parrot_Interp interpreter, struct tm *tm);
-
+INTVAL Parrot_byte_index(Interp *interpreter, STRING *base, STRING *search, UINTVAL start_offset);

+INTVAL Parrot_byte_rindex(Interp *interpreter, STRING *base, STRING *search, UINTVAL start_offset);

/*
* misc.c */
1.50 +2 -2 parrot/include/parrot/pobj.h Index: pobj.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/pobj.h,v retrieving revision 1.49
retrieving revision 1.50
diff -u -r1.49 -r1.50
--- pobj.h 3 Nov 2004 20:58:09 -0000 1.49
+++ pobj.h 27 Feb 2005 09:58:43 -0000 1.50
@@no-spam -1,7 +1,7 @@no-spam /* pobj.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: pobj.h,v 1.49 2004/11/03 20:58:09 dan Exp $
+ * $Id: pobj.h,v 1.50 2005/02/27 09:58:43 leo Exp $
* Overview:
* Parrot Object data members and flags enum * Data Structure and Algorithms:
@@no-spam -128,7 +128,7 @@no-spam UINTVAL bufused;
void *strstart;
UINTVAL strlen;
- parrot_string_representation_t representation;
+ /* parrot_string_representation_t representation;*/
void *encoding; /* These should be of type ENCODING * and CHARSET *
* respectively, but I'm not sure how to get them * to do that without a whole lotta work right now */
1.47 +4 -1 parrot/include/parrot/string_funcs.h Index: string_funcs.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v retrieving revision 1.46
retrieving revision 1.47
diff -u -r1.46 -r1.47
--- string_funcs.h 3 Nov 2004 21:44:38 -0000 1.46
+++ string_funcs.h 27 Feb 2005 09:58:43 -0000 1.47
@@no-spam -1,7 +1,7 @@no-spam /* string_funcs.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: string_funcs.h,v 1.46 2004/11/03 21:44:38 dan Exp $
+ * $Id: string_funcs.h,v 1.47 2005/02/27 09:58:43 leo Exp $
* Overview:
* This is the api header for the string subsystem * Data Structure and Algorithms:
@@no-spam -17,6 +17,8 @@no-spam /* Declarations of accessors */
+void Parrot_reuse_COW_reference(Interp *, STRING *, STRING *);
+STRING *Parrot_make_COW_reference(Interp *, STRING *);
void Parrot_unmake_COW(Interp *, STRING *);
INTVAL string_compute_strlen(Interp *, STRING *);
INTVAL string_max_bytes(Interp *, STRING*, INTVAL);
@@no-spam -54,6 +56,7 @@no-spam parrot_string_representation_t representation);
STRING *string_make(Interp *interpreter, const void *buffer,
UINTVAL len, const char *encoding_name, UINTVAL flags);
+STRING *string_make_direct(Interp *interpreter, const void *buffer, UINTVAL len, ENCODING *encoding, CHARSET *charset, UINTVAL flags);

STRING * string_make_empty(Interp *interpreter,
parrot_string_representation_t representation,
UINTVAL capacity);
1.5 +2 -3 parrot/include/parrot/string_primitives.h Index: string_primitives.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_primitives.h,v retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- string_primitives.h 18 Oct 2004 01:35:25 -0000 1.4
+++ string_primitives.h 27 Feb 2005 09:58:43 -0000 1.5
@@no-spam -1,7 +1,7 @@no-spam /* string_funcs.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: string_primitives.h,v 1.4 2004/10/18 01:35:25 brentdax Exp $
+ * $Id: string_primitives.h,v 1.5 2005/02/27 09:58:43 leo Exp $
* Overview:
* This is the api header for the string subsystem * Data Structure and Algorithms:
@@no-spam -26,8 +26,7 @@no-spam /* Utility method which knows how to uwind a single escape sequence */
typedef Parrot_UInt2 (*Parrot_unescape_cb)(Parrot_Int4 offset, void *context);

Parrot_UInt4
-string_unescape_one(Parrot_unescape_cb cb,
- Parrot_UInt4 *offset, Parrot_UInt4 input_length, void *string);
+string_unescape_one(Interp *interpreter, UINTVAL *offset, STRING *string);
UINTVAL Parrot_char_digit_value(Interp *interpreter, UINTVAL character);
1.110 +3 -2 parrot/io/io.c Index: io.c ===================================================================
RCS file: /cvs/public/parrot/io/io.c,v retrieving revision 1.109
retrieving revision 1.110
diff -u -r1.109 -r1.110
--- io.c 14 Feb 2005 10:57:21 -0000 1.109
+++ io.c 27 Feb 2005 09:58:44 -0000 1.110
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: io.c,v 1.109 2005/02/14 10:57:21 leo Exp $
+$Id: io.c,v 1.110 2005/02/27 09:58:44 leo Exp $
=head1 NAME @@no-spam -116,7 +116,8 @@no-spam PObj_buflen(s) = len;
PObj_sysmem_SET(s);
PObj_external_SET(s);
- s->representation = enum_stringrep_one;
+ s->charset = Parrot_iso_8859_1_charset_ptr;
+ s->encoding = Parrot_fixed_8_encoding_ptr;
/*
* TODO encoding = raw */
1.36 +5 -3 parrot/io/io_buf.c Index: io_buf.c ===================================================================
RCS file: /cvs/public/parrot/io/io_buf.c,v retrieving revision 1.35
retrieving revision 1.36
diff -u -r1.35 -r1.36
--- io_buf.c 14 Feb 2005 11:34:22 -0000 1.35
+++ io_buf.c 27 Feb 2005 09:58:44 -0000 1.36
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: io_buf.c,v 1.35 2005/02/14 11:34:22 leo Exp $
+$Id: io_buf.c,v 1.36 2005/02/27 09:58:44 leo Exp $
=head1 NAME @@no-spam -582,7 +582,8 @@no-spam len = b->endb - buf_start;
if (s->bufused < l) {
if (may_realloc) {
- s->representation = enum_stringrep_one;
+ s->charset = Parrot_iso_8859_1_charset_ptr;
+ s->encoding = Parrot_fixed_8_encoding_ptr;
if (s->strstart) {
Parrot_reallocate_string(interpreter, s, l);
} else {
@@no-spam -602,7 +603,8 @@no-spam }
if (s->bufused < l) {
if (may_realloc) {
- s->representation = enum_stringrep_one;
+ s->charset = Parrot_iso_8859_1_charset_ptr;
+ s->encoding = Parrot_fixed_8_encoding_ptr;
if (s->strstart) {
Parrot_reallocate_string(interpreter, s, l);
} else {
1.25 +2 -1 parrot/lib/Parrot/Configure/Step.pm Index: Step.pm ===================================================================
RCS file: /cvs/public/parrot/lib/Parrot/Configure/Step.pm,v retrieving revision 1.24
retrieving revision 1.25
diff -u -r1.24 -r1.25
--- Step.pm 17 Jan 2005 14:02:06 -0000 1.24
+++ Step.pm 27 Feb 2005 09:58:45 -0000 1.25
@@no-spam -1,5 +1,5 @@no-spam # Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: Step.pm,v 1.24 2005/01/17 14:02:06 leo Exp $
+# $Id: Step.pm,v 1.25 2005/02/27 09:58:45 leo Exp $
=head1 NAME @@no-spam -225,6 +225,7 @@no-spam \$\{(\w+)\}
}{
if(defined(my $val=Configure::Data->get($1))) {
+ #use Data::Dumper;warn Dumper("val for $1 is ",$val);
$val;
}
else {
1.19 +3 -2 parrot/pf/pf_items.c Index: pf_items.c ===================================================================
RCS file: /cvs/public/parrot/pf/pf_items.c,v retrieving revision 1.18
retrieving revision 1.19
diff -u -r1.18 -r1.19
--- pf_items.c 9 Jul 2004 08:43:07 -0000 1.18
+++ pf_items.c 27 Feb 2005 09:58:46 -0000 1.19
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: pf_items.c,v 1.18 2004/07/09 08:43:07 leo Exp $
+$Id: pf_items.c,v 1.19 2005/02/27 09:58:46 leo Exp $
=head1 NAME @@no-spam -580,6 +580,7 @@no-spam PF_store_string(opcode_t *cursor, STRING *s)
{
opcode_t padded_size = s->bufused;
+ opcode_t representation;
char *charcursor;
size_t i;
@@no-spam -590,7 +591,7 @@no-spam }
*cursor++ = PObj_get_FLAGS(s); /* only constant_FLAG and private7 */
- *cursor++ = s->representation;
+ *cursor++ = enum_stringrep_one;
*cursor++ = s->bufused;
/* Switch to char * since rest of string is addressed by 1.3 +58 -1 parrot/src/charset.c Index: charset.c ===================================================================
RCS file: /cvs/public/parrot/src/charset.c,v retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- charset.c 3 Nov 2004 19:10:48 -0000 1.2
+++ charset.c 27 Feb 2005 09:58:47 -0000 1.3
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: charset.c,v 1.2 2004/11/03 19:10:48 dan Exp $
+$Id: charset.c,v 1.3 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -12,13 +12,70 @@no-spam */
+#define PARROT_NO_EXTERN_CHARSET_PTRS #include "parrot/parrot.h"
+CHARSET *Parrot_iso_8859_1_charset_ptr;
+CHARSET *Parrot_binary_charset_ptr;
+CHARSET *Parrot_default_charset_ptr;
+CHARSET *Parrot_unicode_charset_ptr;
+
CHARSET *Parrot_new_charset(Interp *interpreter) {
return mem_sys_allocate(sizeof(CHARSET));
}
+CHARSET *Parrot_find_charset(Interp *interpreter, const char *charsetname) {
+ if (!strcmp("iso-8859-1", charsetname)) {
+ return Parrot_iso_8859_1_charset_ptr;
+ }
+ if (!strcmp("unicode", charsetname)) {
+ return Parrot_unicode_charset_ptr;
+ }
+ if (!strcmp("binary", charsetname)) {
+ return Parrot_binary_charset_ptr;
+ }
+ return NULL;
+}
+
+CHARSET *Parrot_load_charset(Interp *interpreter, const char *charsetname) {
+ internal_exception(UNIMPLEMENTED, "Can't load charsets yet");
+ return NULL;
+}
+
+INTVAL Parrot_register_charset(Interp *interpreter, const char *charsetname, CHARSET *charset) {

+ if (!strcmp("binary", charsetname)) {
+ Parrot_binary_charset_ptr = charset;
+ return 1;
+ }
+ if (!strcmp("iso-8859-1", charsetname)) {
+ Parrot_iso_8859_1_charset_ptr = charset;
+ if (!Parrot_default_charset_ptr) {
+ Parrot_default_charset_ptr = charset;
+
+ }
+ return 1;
+ }
+ if (!strcmp("unicode", charsetname)) {
+ Parrot_unicode_charset_ptr = charset;
+ return 1;
+ }
+ return 0;
+}
+
+INTVAL Parrot_make_default_charset(Interp *interpreter, const char *charsetname, CHARSET *charset) {

+ Parrot_default_charset_ptr = charset;
+ return 1;
+}
+
+CHARSET *Parrot_default_charset(Interp *interpreter) {
+ return Parrot_default_charset_ptr;
+}
+
+charset_converter_t Parrot_find_charset_converter(Interp *interpreter, CHARSET *lhs, CHARSET *rhs) {

+ return NULL;
+}
+
/*
* Local variables:
* c-indentation-style: bsd 1.22 +42 -4 parrot/src/encoding.c Index: encoding.c ===================================================================
RCS file: /cvs/public/parrot/src/encoding.c,v retrieving revision 1.21
retrieving revision 1.22
diff -u -r1.21 -r1.22
--- encoding.c 3 Nov 2004 19:10:48 -0000 1.21
+++ encoding.c 27 Feb 2005 09:58:47 -0000 1.22
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: encoding.c,v 1.21 2004/11/03 19:10:48 dan Exp $
+$Id: encoding.c,v 1.22 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -12,19 +12,57 @@no-spam */
+#define PARROT_NO_EXTERN_ENCODING_PTRS #include "parrot/parrot.h"
+ENCODING *Parrot_default_encoding_ptr;
+ENCODING *Parrot_fixed_8_encoding_ptr;
+
/* Yep, this needs to be a char * parameter -- it's tough to load in encodings and such for strings if we can't be sure we've got enough info set up to actually build strings... */
-ENCODING *Parrot_load_encoding(Interp *intepreter, const char *encoding_name) {

- return NULL;
-}
ENCODING *Parrot_new_encoding(Interp *interpreter) {
return mem_sys_allocate(sizeof(ENCODING));
}
+ENCODING *Parrot_find_encoding(Interp *interpreter, const char *encodingname) {

+ if (!strcmp("fixed_8", encodingname)) {
+ return Parrot_fixed_8_encoding_ptr;
+ }
+ return NULL;
+}
+
+ENCODING *Parrot_load_encoding(Interp *interpreter, const char *encodingname) {

+ internal_exception(UNIMPLEMENTED, "Can't load encodings yet");
+ return NULL;
+}
+
+INTVAL Parrot_register_encoding(Interp *interpreter, const char *encodingname, ENCODING *encoding) {

+ if (!strcmp("fixed_8", encodingname)) {
+ Parrot_fixed_8_encoding_ptr = encoding;
+ if (!Parrot_default_encoding_ptr) {
+ Parrot_default_encoding_ptr = encoding;
+
+ }
+ return 1;
+ }
+ return 0;
+}
+
+INTVAL Parrot_make_default_encoding(Interp *interpreter, const char *encodingname, ENCODING *encoding) {

+ Parrot_default_encoding_ptr = encoding;
+ return 1;
+}
+
+ENCODING *Parrot_default_encoding(Interp *interpreter) {
+ return Parrot_default_encoding_ptr;
+}
+
+encoding_converter_t Parrot_find_encoding_converter(Interp *interpreter, ENCODING *lhs, ENCODING *rhs) {

+ return NULL;
+}
+
/*
* Local variables:
* c-indentation-style: bsd 1.9 +2 -2 parrot/src/global.c Index: global.c ===================================================================
RCS file: /cvs/public/parrot/src/global.c,v retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- global.c 13 Dec 2004 13:46:25 -0000 1.8
+++ global.c 27 Feb 2005 09:58:47 -0000 1.9
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: global.c,v 1.8 2004/12/13 13:46:25 leo Exp $
+$Id: global.c,v 1.9 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -54,7 +54,7 @@no-spam * hash lookup duplication */
HashBucket *b;
-#ifdef FIND_DEBUG +#ifdef DEBUG_FIND PIO_printf(interpreter, "find_global class '%Ss' meth '%Ss\n",
class, globalname);
#endif 1.57 +1 -11 parrot/src/global_setup.c Index: global_setup.c ===================================================================
RCS file: /cvs/public/parrot/src/global_setup.c,v retrieving revision 1.56
retrieving revision 1.57
diff -u -r1.56 -r1.57
--- global_setup.c 13 Dec 2004 13:46:25 -0000 1.56
+++ global_setup.c 27 Feb 2005 09:58:47 -0000 1.57
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: global_setup.c,v 1.56 2004/12/13 13:46:25 leo Exp $
+$Id: global_setup.c,v 1.57 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -24,10 +24,6 @@no-spam #define INSIDE_GLOBAL_SETUP #include "parrot/parrot.h"
-#include "../encodings/fixed_8.h"
-#include "../charset/ascii.h"
-#include "../charset/binary.h"
-
/* These functions are defined in the auto-generated file core_pmcs.c */
extern void Parrot_initialize_core_pmcs(Interp *interp);
extern void Parrot_register_core_pmcs(Interp *interp, PMC *registry);
@@no-spam -62,12 +58,6 @@no-spam Parrot_platform_init_code();
#endif - /* Load in the basic encodings and charsets */
- Parrot_encoding_fixed_8_init(interpreter);
- Parrot_charset_binary_init(interpreter);
- Parrot_charset_ascii_init(interpreter);
-
-
/*
* TODO allocate core vtable table only once - or per interpreter * divide globals into real globals and per interpreter 1.131 +3 -2 parrot/src/objects.c Index: objects.c ===================================================================
RCS file: /cvs/public/parrot/src/objects.c,v retrieving revision 1.130
retrieving revision 1.131
diff -u -r1.130 -r1.131
--- objects.c 23 Feb 2005 09:45:03 -0000 1.130
+++ objects.c 27 Feb 2005 09:58:47 -0000 1.131
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: objects.c,v 1.130 2005/02/23 09:45:03 leo Exp $
+$Id: objects.c,v 1.131 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -198,7 +198,8 @@no-spam delegate_vtable = Parrot_base_vtables[enum_class_delegate];
memset(&meth_str, 0, sizeof(meth_str));
- meth_str.representation = enum_stringrep_one;
+ meth_str.encoding = Parrot_fixed_8_encoding_ptr;
+ meth_str.charset = Parrot_iso_8859_1_charset_ptr;
for (i = 0; (meth = Parrot_vtable_slot_names[i]); ++i) {
if (!*meth)
continue;
1.94 +2 -2 parrot/src/pmc.c Index: pmc.c ===================================================================
RCS file: /cvs/public/parrot/src/pmc.c,v retrieving revision 1.93
retrieving revision 1.94
diff -u -r1.93 -r1.94
--- pmc.c 25 Jan 2005 14:47:33 -0000 1.93
+++ pmc.c 27 Feb 2005 09:58:47 -0000 1.94
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: pmc.c,v 1.93 2005/01/25 14:47:33 leo Exp $
+$Id: pmc.c,v 1.94 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -456,7 +456,7 @@no-spam class_name, 0) == 0);
for (pos = 0; ;) {
len = string_length(interpreter, class_name);
- pos += len + 1;
+ pos += len+1;
if (pos >= (INTVAL)string_length(interpreter, vtable->isa_str))
break;
len = string_str_index(interpreter, vtable->isa_str,
1.232 +168 -967 parrot/src/string.c Index: string.c ===================================================================
RCS file: /cvs/public/parrot/src/string.c,v retrieving revision 1.231
retrieving revision 1.232
diff -u -r1.231 -r1.232
--- string.c 15 Feb 2005 08:55:41 -0000 1.231
+++ string.c 27 Feb 2005 09:58:47 -0000 1.232
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.231 2005/02/15 08:55:41 leo Exp $
+$Id: string.c,v 1.232 2005/02/27 09:58:47 leo Exp $
=head1 NAME @@no-spam -23,6 +23,12 @@no-spam #include "parrot/parrot.h"
#include <assert.h>
+#include "../encodings/fixed_8.h"
+#include "../charset/ascii.h"
+#include "../charset/binary.h"
+#include "../charset/iso-8859-1.h"
+
+
/*
* this extra size is in the hope, that some concat ops might * follow in a sequence.
@@no-spam -37,16 +43,15 @@no-spam #define EXTRA_SIZE 256
-/* statics */
-static void _string_upscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation, UINTVAL capacity);
-static void _string_downscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation);
-static parrot_string_representation_t _string_smallest_representation(
- Interp *interpreter, STRING *s);
+
+#define saneify_string(s) \
+ assert(s->encoding); \
+ assert(s->charset)
+
/*
+
=head2 String COW support =over 4
@@no-spam -122,7 +127,7 @@no-spam /*
=item C<static STRING *
-make_COW_reference(Interp *interpreter, STRING *s)>
+Parrot_make_COW_reference(Interp *interpreter, STRING *s)>
Creates a copy-on-write string by cloning a string header without allocating a new buffer.
@@no-spam -131,8 +136,8 @@no-spam */
-static STRING *
-make_COW_reference(Interp *interpreter, STRING *s)
+STRING *
+Parrot_make_COW_reference(Interp *interpreter, STRING *s)
{
STRING *d;
if (s == NULL) {
@@no-spam -153,11 +158,42 @@no-spam }
return d;
}
+/*
+
+=item C<static STRING *
+Parrot_reuse_COW_reference(Interp *interpreter, STRING *s, STRING *reuse)>
+
+Creates a copy-on-write string by cloning a string header without +allocating a new buffer. Doesn't allocate a new string header, instead +using the one passed in +
+=cut +
+*/
+
+void +Parrot_reuse_COW_reference(Interp *interpreter, STRING *s, STRING *d)
+{
+ if (s == NULL) {
+ return;
+ }
+ if (PObj_constant_TEST(s)) {
+ PObj_constant_CLEAR(s);
+ PObj_is_cowed_SETALL(s);
+ copy_string_header(interpreter, d, s);
+ PObj_constant_CLEAR(d);
+ }
+ else {
+ PObj_COW_SET(s);
+ copy_string_header(interpreter, d, s);
+ PObj_sysmem_CLEAR(d);
+ }
+}
/*
=item C<static void -make_COW_reference_from_header(Interp *interpreter,
+Parrot_make_COW_reference_from_header(Interp *interpreter,
STRING *s, STRING *d)>
Makes the second Parrot string a copy-on-write reference to first.
@@no-spam -167,7 +203,7 @@no-spam */
static void -make_COW_reference_from_header(Interp *interpreter,
+Parrot_make_COW_reference_from_header(Interp *interpreter,
STRING *s, STRING *d)
{
if (PObj_constant_TEST(s)) {
@@no-spam -207,10 +243,10 @@no-spam mem_sys_free((INTVAL*)PObj_bufstart(dest) - 1);
}
#endif - make_COW_reference_from_header(interpreter, src, dest);
+ Parrot_make_COW_reference_from_header(interpreter, src, dest);
}
else - dest = make_COW_reference(interpreter, src);
+ dest = Parrot_make_COW_reference(interpreter, src);
return dest;
}
@@no-spam -252,6 +288,12 @@no-spam } __ptr_u;
if (!interpreter->parent_interpreter) {
+ /* Load in the basic encodings and charsets */
+ Parrot_encoding_fixed_8_init(interpreter);
+ Parrot_charset_binary_init(interpreter);
+ Parrot_charset_ascii_init(interpreter);
+ Parrot_charset_iso_8859_1_init(interpreter);
+
/* DEFAULT_ICU_DATA_DIR is configured at build time, or it may be set through the $PARROT_ICU_DATA_DIR environment variable. Need a way to specify this via the command line as well. */
@@no-spam -282,7 +324,8 @@no-spam if (free_data_dir)
mem_sys_free((void*)data_dir); /* cast away the constness */
}
-/*
+
+/* --- Perhaps these should be uncommented - Leo encoding_init();
chartype_init();
string_native_type = chartype_lookup("usascii");
@@no-spam -327,8 +370,13 @@no-spam UINTVAL string_capacity(Interp *interpreter, STRING *s)
{
+ if (s->encoding) {
return ((ptrcast_t)PObj_bufstart(s) + PObj_buflen(s) -
- (ptrcast_t)s->strstart) / (s->representation);
+ (ptrcast_t)s->strstart) / ENCODING_MAX_BYTES_PER_CODEPOINT(interpreter, s);

+ } else {
+ return ((ptrcast_t)PObj_bufstart(s) + PObj_buflen(s) -
+ (ptrcast_t)s->strstart);
+ }
}
/*
@@no-spam -351,7 +399,12 @@no-spam s = new_string_header(interpreter, 0);
- s->representation = representation;
+ if (representation == enum_stringrep_one) {
+ s->encoding = PARROT_DEFAULT_ENCODING;
+ s->charset = PARROT_DEFAULT_CHARSET;
+ } else {
+ internal_exception(INVALID_CHARTYPE, "Unsupported representation");
+ }
Parrot_allocate_string(interpreter,
s, string_max_bytes(interpreter, s, capacity));
@@no-spam -359,145 +412,6 @@no-spam return s;
}
-/* downscale would need checks? */
-static void -_string_upscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation, UINTVAL capacity)
-{
- if (s->representation >= representation) {
- if (capacity > s->strlen) {
- string_grow(interpreter, s, capacity - s->strlen);
- }
- }
- else { /* s->representation < representation */
- STRING *temp;
- UINTVAL needed_length = s->strlen;
-
- if (capacity > needed_length)
- needed_length = capacity;
-
- temp = string_make_empty(interpreter, representation, needed_length);

- string_append(interpreter, temp, s, s->obj.flags);
- string_set(interpreter, s, temp);
- s->hashval = 0;
-
- /*
- s->representation = temp->representation;
- s->bufstart = temp->bufstart;
- s->buflen = temp->buflen;
- s->strstart = temp->strstart;
- s->bufused = temp->bufused;
- s->obj.flags = temp->obj.flags;
- */
- }
-}
-
-/* currently, doesn't do any checks to see if the downscale should be allowed.

- thus, assumes caller has already checked. */
-static void -_string_downscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation)
-{
- if (s->representation <= representation) {
- return; /* do nothing */
- }
- else { /* s->representation > representation */
- UINTVAL count = s->strlen;
-
- if (s->representation == enum_stringrep_four) {
- Parrot_UInt4 *oldCursor = (Parrot_UInt4*)s->strstart;
-
- if (representation == enum_stringrep_two) {
- Parrot_UInt2 *newCursor = (Parrot_UInt2*)s->strstart;
-
- while (count--) {
- *(newCursor++) = *(oldCursor++);
- }
- }
- else { /* representation == enum_stringrep_one */
- Parrot_UInt1 *newCursor = (Parrot_UInt1*)s->strstart;
-
- while (count--) {
- *(newCursor++) = *(oldCursor++);
- }
- }
- }
- else {
- /*
- * s-> representation == enum_stringrep_two,
- * representation == enum_stringrep_one - */
- Parrot_UInt2 *oldCursor = (Parrot_UInt2*)s->strstart;
- Parrot_UInt1 *newCursor = (Parrot_UInt1*)s->strstart;
-
- while (count--) {
- assert(*oldCursor <= 0xFF);
- *(newCursor++) = *(oldCursor++);
- }
- }
-
- s->representation = representation;
- s->bufused = string_max_bytes(interpreter, s, s->strlen);
- }
-}
-
-/* temporary */
-void Parrot_string_downscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation);
-
-void -Parrot_string_downscale(Interp *interpreter, STRING *s,
- parrot_string_representation_t representation)
-{
- _string_downscale(interpreter, s, representation);
-}
-
-static parrot_string_representation_t -_string_smallest_representation(Interp *interpreter, STRING *s)
-{
- if (s->representation == enum_stringrep_one) {
- return enum_stringrep_one;
- }
- else if (s->representation == enum_stringrep_two) {
- Parrot_UInt2 *cur = (Parrot_UInt2 *)s->strstart;
- Parrot_UInt2 *end = cur + s->strlen;
-
- while (cur < end) {
- if (*cur > 255) {
- return enum_stringrep_two;
- }
- cur++;
- }
-
- return enum_stringrep_one;
- }
- else if (s->representation == enum_stringrep_four) {
- Parrot_UInt4 *cur = (Parrot_UInt4 *)s->strstart;
- Parrot_UInt4 *end = cur + s->strlen;
- int saw_two = 0;
-
- while (cur < end) {
- if (*cur > 0xFFFF) {
- return enum_stringrep_four;
- }
- else if (*cur > 255) {
- saw_two = 1;
- }
- cur++;
- }
-
- if (saw_two) {
- return enum_stringrep_two;
- }
- else {
- return enum_stringrep_one;
- }
- }
- else { /* trouble */
- return enum_stringrep_unknown;
- }
-}
-
/*
=item C<STRING *
@@no-spam -527,6 +441,9 @@no-spam if (a == NULL)
return string_copy(interpreter, b);
+ saneify_string(a);
+ saneify_string(b);
+
a_capacity = string_capacity(interpreter, a);
total_length = string_length(interpreter, a) +
string_length(interpreter, b);
@@no-spam -541,11 +458,6 @@no-spam total_length = string_length(interpreter, a)
+ string_length(interpreter, b);
- if (a->representation < b->representation) {
- _string_upscale(interpreter, a, b->representation, total_length);
- }
-
- if (a->representation >= b->representation) {
/* make sure A's big enough for both */
if (a_capacity < total_length)
{
@@no-spam -559,7 +471,7 @@no-spam /* A is now ready to receive the contents of B */
/* if same rep, can memcopy */
- if (a->representation == b->representation) {
+ if (a->encoding == b->encoding && a->charset == b->charset) {
/* Tack B on the end of A */
mem_sys_memcopy((void *)((ptrcast_t)a->strstart + a->bufused),
b->strstart, b->bufused);
@@no-spam -569,74 +481,7 @@no-spam return a;
}
else {
- /* if not, need to loop - * fast_byte_append v. safe_byte_append - */
- /* remember, this is the case of rep A > rep B */
- if (a->representation == enum_stringrep_two) {
- /* B must have rep one */
- Parrot_UInt2 *a_cursor = (Parrot_UInt2 *)
- ((ptrcast_t)a->strstart + a->bufused);
- Parrot_UInt1 *b_cursor = (Parrot_UInt1 *)
- ((ptrcast_t)b->strstart);
- Parrot_UInt1 *b_end = (Parrot_UInt1 *)
- ((ptrcast_t)b->strstart + b->bufused);
-
- while (b_cursor < b_end) {
- *(a_cursor++) = *(b_cursor++);
- }
-
- a->bufused = (ptrcast_t)a_cursor - (ptrcast_t)a->strstart;
- string_compute_strlen(interpreter, a);
- }
- else if (a->representation == enum_stringrep_four) {
- Parrot_UInt4 *a_cursor = (Parrot_UInt4 *)
- ((ptrcast_t)a->strstart + a->bufused);
-
- switch (b->representation) {
- case enum_stringrep_one:
- {
- Parrot_UInt1 *b_cursor = (Parrot_UInt1 *)
- ((ptrcast_t)b->strstart);
- Parrot_UInt1 *b_end = (Parrot_UInt1 *)
- ((ptrcast_t)b->strstart + b->bufused);
-
- while (b_cursor < b_end) {
- *(a_cursor++) = *(b_cursor++);
- }
-
- a->bufused = (ptrcast_t)a_cursor - (ptrcast_t)a->strstart;

- string_compute_strlen(interpreter, a);
-
- break;
- }
- case enum_stringrep_two:
- {
- Parrot_UInt2 *b_cursor = (Parrot_UInt2 *)
- ((ptrcast_t)b->strstart);
- Parrot_UInt2 *b_end = (Parrot_UInt2 *)
- ((ptrcast_t)b->strstart + b->bufused);
-
- while (b_cursor < b_end) {
- *(a_cursor++) = *(b_cursor++);
- }
-
- a->bufused = (ptrcast_t)a_cursor -
- (ptrcast_t)a->strstart;
- string_compute_strlen(interpreter, a);
-
- break;
- }
- default:
- /* trouble */
- break;
- }
- }
- else - {
- /* problem */
- }
- }
+ internal_exception(UNIMPLEMENTED, "Cross-type string appending (%s/%s) (%s/%s) unsupported", ((ENCODING *)(a->encoding))->name, ((CHARSET *)(a->charset))->name, ((ENCODING *)(b->encoding))->name, ((CHARSET *)(b->charset))->name);

}
return a;
@@no-spam -659,9 +504,11 @@no-spam string_from_cstring(Interp *interpreter,
const void *buffer, UINTVAL len)
{
- return string_make(interpreter, buffer, len ? len :
+ return string_make_direct(interpreter, buffer, len ? len :
buffer ? strlen(buffer) : 0,
- "iso-8859-1", 0); /* make this utf-8 eventually? */
+ PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,

+ 0); /* Force an 8-bit encoding at some + point? */
}
/*
@@no-spam -681,9 +528,9 @@no-spam string_from_const_cstring(Interp *interpreter,
const void *buffer, UINTVAL len)
{
- return string_make(interpreter, buffer, len ? len :
+ return string_make_direct(interpreter, buffer, len ? len :
buffer ? strlen(buffer) : 0,
- "iso-8859-1", 0); /* make this utf-8 eventually? */
+ PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET, 0); /* make this utf-8 eventually? */

}
/*
@@no-spam -709,12 +556,6 @@no-spam case enum_stringrep_one:
return "iso-8859-1";
break;
- case enum_stringrep_two:
- return "ucs-2";
- break;
- case enum_stringrep_four:
- return "utf-32";
- break;
default:
internal_exception(INVALID_STRING_REPRESENTATION,
"string_primary_encoding_for_representation: "
@@no-spam -739,8 +580,9 @@no-spam const_string(Interp *interpreter, const char *buffer)
{
/* TODO cache the strings */
- return string_make(interpreter, buffer, strlen(buffer),
- "iso-8859-1", PObj_external_FLAG|PObj_constant_FLAG);
+ return string_make_direct(interpreter, buffer, strlen(buffer),
+ PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ PObj_external_FLAG|PObj_constant_FLAG);
}
/*
@@no-spam -773,6 +615,26 @@no-spam string_make(Interp *interpreter, const void *buffer,
UINTVAL len, const char *encoding_name, UINTVAL flags)
{
+ ENCODING *encoding;
+ CHARSET *charset;
+ if (!encoding_name) {
+ internal_exception(MISSING_ENCODING_NAME,
+ "string_make: no encoding name specified");
+ }
+
+ if (strcmp(encoding_name, "iso-8859-1") == 0 ) {
+ encoding = Parrot_fixed_8_encoding_ptr;
+ charset = Parrot_iso_8859_1_charset_ptr;
+ }
+ else {
+ internal_exception(UNIMPLEMENTED, "Can't make non-iso-8859-1 strings");

+ }
+ return string_make_direct(interpreter, buffer, len, encoding, charset, flags);

+
+}
+
+STRING *
+string_make_direct(Interp *interpreter, const void *buffer, UINTVAL len, ENCODING *encoding, CHARSET *charset, UINTVAL flags) {

STRING *s = NULL;
union {
const void * __c_ptr;
@@no-spam -787,16 +649,12 @@no-spam "string_make: buffer pointer NULL, but length nonzero");
}
- if (!encoding_name) {
- internal_exception(MISSING_ENCODING_NAME,
- "string_make: no encoding name specified");
- }
- else {
+
s = new_string_header(interpreter, flags);
- s->representation = enum_stringrep_unknown;
+ s->encoding = encoding;
+ s->charset = charset;
- if (strcmp(encoding_name, "iso-8859-1") == 0 ) {
- s->representation = enum_stringrep_one;
+ if (encoding == Parrot_fixed_8_encoding_ptr && charset == Parrot_iso_8859_1_charset_ptr) {

/*
* fast path for external (constant) strings - don't allocate * and copy data @@no-spam -814,17 +672,7 @@no-spam return s;
}
}
- else if (strcmp(encoding_name, "ucs-2") == 0 ) {
- /* worry about endian-ness */
- s->representation = enum_stringrep_two;
- }
- else if (strcmp(encoding_name, "utf-32") == 0 ) {
- s->representation = enum_stringrep_four;
- }
- if (s->representation != enum_stringrep_unknown) {
- /* fast encodings */
- /* decide in here on the size to use, and transcode.... */
Parrot_allocate_string(interpreter, s, len);
if (buffer) {
@@no-spam -835,11 +683,6 @@no-spam else {
s->strlen = s->bufused = 0;
}
- }
- else {
- string_fill_from_buffer(interpreter, buffer, len, encoding_name, s);

- }
- }
return s;
}
@@no-spam -888,30 +731,6 @@no-spam {
return s ? s->strlen : 0;
}
-
-/* XXX Is this right? */
-void *
-string_pointer_to_index(Interp * interpreter,
- const STRING *s, UINTVAL idx)
-{
- switch (s->representation) {
- case enum_stringrep_one:
- return ((Parrot_UInt1*)s->strstart + idx);
- break;
- case enum_stringrep_two:
- return ((Parrot_UInt2*)s->strstart + idx);
- break;
- case enum_stringrep_four:
- return ((Parrot_UInt4*)s->strstart + idx);
- break;
- default:
- internal_exception(INVALID_STRING_REPRESENTATION,
- "string_pointer_to_index: invalid string representation");
- return NULL; /* make compiler happy */
- break;
- }
-}
-
/*
=item C<INTVAL @@no-spam -932,159 +751,8 @@no-spam INTVAL string_index(Interp * interpreter, const STRING *s, UINTVAL idx)
{
- switch (s->representation) {
- case enum_stringrep_one:
- return *((Parrot_UInt1*)s->strstart + idx);
- break;
- case enum_stringrep_two:
- return *((Parrot_UInt2*)s->strstart + idx);
- break;
- case enum_stringrep_four:
- return *((Parrot_UInt4*)s->strstart + idx);
- break;
- default:
- internal_exception(INVALID_STRING_REPRESENTATION,
- "string_index: invalid string representation");
- return -1; /* make compiler happy */
- break;
- }
-}
-
-/*
-
-=item C<static INTVAL -string_str_index_twobyte(Interp *interpreter,
- const STRING *str, const STRING *find, UINTVAL start)>
-
-Helper function for C<string_str_index()>. This is optimized for the -simple case where both strings are in UCS-2 (C<enum_stringrep_two>). It -implements the Boyer-Moore string search algorithm.
-
-=cut -
-*/
-
-static INTVAL -string_str_index_twobyte(Interp *interpreter,
- const STRING *str, const STRING *find, UINTVAL start)
-{
- const Parrot_UInt2* const find_strstart = find->strstart;
- const Parrot_UInt2* const str_strstart = str->strstart;
- const UINTVAL find_strlen = find->strlen;
- const UINTVAL str_strlen = str->strlen;
- const Parrot_UInt2* const lastmatch = str_strstart + str_strlen;
- UINTVAL* p;
- const Parrot_UInt2* cp;
- UINTVAL endct, pos;
- UINTVAL badshift[256];
-
- /* Prepare the bad shift buffer */
-
- for (p = &badshift[0] ; p < &badshift[256] ; p++) {
- *p = find_strlen;
- }
-
- endct = 1;
- cp = find_strstart + find_strlen - 2; /* why "- 2" ? */
- for (; cp >= find_strstart ; cp--, endct++) {
- UINTVAL offset = (256 & *cp);
-
- if (endct < badshift[offset]) { /* or find_strlen != badshift[offset] */

- badshift[offset] = endct;
- }
- }
-
- /* Perform the match */
-
- pos = start;
- cp = str_strstart + start + find_strlen;
- while (cp <= lastmatch) {
- register const Parrot_UInt2* sp = cp;
- register const Parrot_UInt2* fp = find_strstart + find_strlen;
-
- while (fp > find_strstart) {
- if (*--fp != *--sp)
- break;
- }
- if (*fp == *sp) {
- return pos;
- }
- else {
- register UINTVAL bsi = badshift[256 & *(cp-1)];
- cp += bsi;
- pos += bsi;
- }
- }
-
- return -1;
-}
-
-/*
-
-=item C<static INTVAL -string_str_index_singlebyte(Interp *interpreter,
- const STRING *str, const STRING *find, UINTVAL start)>
-
-Helper function for C<string_str_index()>. This is optimized for the -simple case where both strings are in ISO-8859-1
-(C<enum_stringrep_one>). It implements the Boyer-Moore string search -algorithm.
-
-=cut -
-*/
-
-static INTVAL -string_str_index_singlebyte(Interp *interpreter,
- const STRING *str, const STRING *find, UINTVAL start)
-{
- const unsigned char* const find_strstart = find->strstart;
- const unsigned char* const str_strstart = str->strstart;
- const UINTVAL find_strlen = find->strlen;
- const UINTVAL str_strlen = str->strlen;
- const unsigned char* const lastmatch = str_strstart + str_strlen;
- UINTVAL* p;
- const unsigned char* cp;
- UINTVAL endct, pos;
- UINTVAL badshift[256];
-
- /* Prepare the bad shift buffer */
-
- for (p = &badshift[0] ; p < &badshift[256] ; p++) {
- *p = find_strlen;
- }
-
- endct = 1;
- cp = find_strstart + find_strlen - 2;
- for (; cp >= find_strstart ; cp--, endct++) {
- if (endct < badshift[*cp]) {
- badshift[*cp] = endct;
- }
- }
-
- /* Perform the match */
-
- pos = start;
- cp = str_strstart + start + find_strlen;
- while (cp <= lastmatch) {
- register const unsigned char* sp = cp;
- register const unsigned char* fp = find_strstart + find_strlen;
-
- while (fp > find_strstart) {
- if (*--fp != *--sp)
- break;
- }
- if (*fp == *sp) {
- return pos;
- }
- else {
- register UINTVAL bsi = badshift[*(cp-1)];
- cp += bsi;
- pos += bsi;
- }
- }
-
- return -1;
+ saneify_string(s);
+ return (INTVAL)CHARSET_GET_CODEPOINT(interpreter, s, idx);
}
/*
@@no-spam -1116,6 +784,17 @@no-spam if (!s2 || !string_length(interpreter, s2))
return -1;
+ saneify_string(s);
+#if 0
+ /* At startup we sometimes get empty strings. Not yet tracked down */
+ if (!s->charset) {
+ s->charset = Parrot_iso_8859_1_charset_ptr;
+ }
+ if (!s2->charset) {
+ s2->charset = Parrot_iso_8859_1_charset_ptr;
+ }
+#endif +
/* here, check of the search string has a different rep than the target string. if so, up- or down-size the search string. upsizing is easy.
for downsizing, need to check if there are any characters which won't @@no-spam -1127,28 +806,7 @@no-spam with search string smaller rep, and size mismatched with search string larger rep. */
- if (s->representation == s2->representation) {
- switch (s->representation) {
- case enum_stringrep_one:
- return string_str_index_singlebyte(interpreter, s, s2, start);

- break;
- case enum_stringrep_two:
- return string_str_index_twobyte(interpreter, s, s2, start);
- break;
- default:
- internal_exception(UNIMPLEMENTED,
- "string_str_index: case not implemented yet");
- return -1;
- break;
- break;
- }
- }
- else - {
- internal_exception(UNIMPLEMENTED,
- "string_str_index: case not implemented yet");
- }
- return -1;
+ return CHARSET_INDEX(interpreter, s, s2, start);
}
/*
@@no-spam -1216,21 +874,7 @@no-spam STRING *
string_chr(Interp *interpreter, UINTVAL character)
{
- if (character <= 0xFF) {
- Parrot_UInt1 c = (Parrot_UInt1)character;
- return string_make(interpreter,
- &c, (UINTVAL)sizeof(Parrot_UInt1), "iso-8859-1", 0);
- }
- else if (character <= 0xFFFF) {
- Parrot_UInt2 c = (Parrot_UInt2)character;
- return string_make(interpreter,
- &c, (UINTVAL)sizeof(Parrot_UInt2), "ucs-2", 0);
- }
- else {
- Parrot_UInt4 c = (Parrot_UInt4)character;
- return string_make(interpreter,
- &c, (UINTVAL)sizeof(Parrot_UInt4), "utf-32", 0);
- }
+ return Parrot_iso_8859_1_charset_ptr->string_from_codepoint(interpreter, character);

}
@@no-spam -1248,7 +892,7 @@no-spam STRING *
string_copy(Interp *interpreter, STRING *s)
{
- return make_COW_reference(interpreter, s);
+ return Parrot_make_COW_reference(interpreter, s);
}
@@no-spam -1274,8 +918,7 @@no-spam string_compute_strlen(Interp *interpreter, STRING *s)
{
/* taking advantage of int value of the enum */
- s->strlen = ((ptrcast_t)PObj_bufstart(s) + s->bufused -
- (ptrcast_t)s->strstart) / (s->representation);
+ s->strlen = CHARSET_CODEPOINTS(interpreter, s);
return s->strlen;
}
@@no-spam -1298,7 +941,10 @@no-spam /* XXXX: here (and a couple of other places) we are taking advantage the numerical value of s->representation being equal to sizeof(relevant type), and we probably shouldn't */
- return (nchars * (s->representation));
+ if (s->encoding) {
+ return (ENCODING_MAX_BYTES_PER_CODEPOINT(interpreter, s) * nchars);
+ }
+ return (nchars);
}
/*
@@no-spam -1324,9 +970,7 @@no-spam if (a != NULL && a->strlen != 0) {
if (b != NULL && b->strlen != 0) {
STRING *result =
- string_make_empty(interpreter,
- (a->representation >= b->representation) ?
- a->representation : b->representation,
+ string_make_empty(interpreter, enum_stringrep_one,
a->strlen + b->strlen);
string_append(interpreter, result, a, Uflags);
@@no-spam -1369,7 +1013,7 @@no-spam STRING *dest;
UINTVAL i;
- dest = string_make_empty(interpreter, s->representation, s->strlen * num);

+ dest = string_make_empty(interpreter, enum_stringrep_one, s->strlen * num);

/* dest = string_make(interpreter, NULL, s->bufused * num, s->encoding, 0,

s->type); */
@@no-spam -1414,6 +1058,7 @@no-spam UINTVAL true_offset;
UINTVAL true_length;
+ saneify_string(src);
true_offset = (UINTVAL)offset;
/* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
@@no-spam -1436,17 +1081,12 @@no-spam }
/* do in-place i.e. reuse existing header if one */
- if (replace_dest)
- dest = string_set(interpreter, *d, src);
+ if (replace_dest && *d) {
+ CHARSET_GET_CODEPOINTS_INPLACE(interpreter, src, *d, true_offset, true_length);

+ dest = *d;
+ }
else - dest = make_COW_reference(interpreter, src);
-
- dest->strstart = (char *)dest->strstart - + string_max_bytes(interpreter, dest, true_offset);
- dest->bufused = string_max_bytes(interpreter, dest, true_length);
-
- dest->strlen = true_length;
- dest->hashval = 0;
+ dest = CHARSET_GET_CODEPOINTS(interpreter, src, true_offset, true_length);

if (d != NULL) {
*d = dest;
@@no-spam -1497,8 +1137,8 @@no-spam true_length = (UINTVAL)length;
/* may have different reps..... */
- if (rep->representation < src->representation) {
- _string_upscale(interpreter, rep, src->representation, 0);
+ if (src->encoding != rep->encoding || src->charset != rep->charset) {
+ internal_exception(UNIMPLEMENTED, "Can't handle mixed types yet");
}
/* abs(-offset) may not be > strlen-1 */
@@no-spam -1524,7 +1164,7 @@no-spam if (d != NULL) {
UINTVAL length_bytes = string_max_bytes(interpreter, src, true_length);

- dest = string_make_empty(interpreter, src->representation, true_length);

+ dest = string_make_empty(interpreter, enum_stringrep_one, true_length);

mem_sys_memcopy(dest->strstart,
(char *)src->strstart @@no-spam -1539,21 +1179,6 @@no-spam /* Now do the replacement */
- /* this section could be more efficient, if we don't prescale the - whole string(s) */
- if (rep->representation > src->representation) {
- if ( _string_smallest_representation(interpreter, rep) <=
- src->representation ) {
- /* downsize replacement string */
- _string_downscale(interpreter, rep, src->representation);
- }
- else {
- /* must upsize target string; would be more efficient to do - such that the replacement is done at the same time */
- _string_upscale(interpreter, src, rep->representation, 0);
- }
- }
- /* either way, they now have the same rep */
/* XXXX: make sure the rest of this method is correct, vis-a-vis byte v.
character */
@@no-spam -1682,64 +1307,6 @@no-spam } \
} while(0)
-/*
-
-=item C<INTVAL -string_compare(Interp *interpreter,
- STRING *s1, STRING *s2)>
-
-Compares two Parrot strings, performing type and encoding conversions if -necessary.
-
-Returns the standard -1, 0, 1 comparison result, indicating whether -the first string was C<< < >>, C<==>, C<< > >> the second.
-
-=cut -
-*/
-
-static INTVAL -cmp_diff_repr(STRING *s1, STRING *s2)
-{
-
- /* make this 3 more cases, rather than 6 */
- INTVAL multiplier;
- STRING *larger;
- STRING *smaller;
- INTVAL cmp;
-
- if (s1->representation > s2->representation) {
- larger = s1;
- smaller = s2;
- multiplier = 1;
- }
- else {
- larger = s2;
- smaller = s1;
- multiplier = -1;
- }
-
- if (larger->representation == enum_stringrep_four) {
- if (smaller->representation == enum_stringrep_two) {
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2,
- larger, smaller, cmp);
- }
- else {
- /* smaller->representation == enum_stringrep_one */
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1,
- larger, smaller, cmp);
- }
- }
- else {
- /*
- * larger->representation == enum_stringrep_two,
- * smaller->representation == enum_stringrep_one - */
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
- }
-
- return cmp * multiplier;
-}
INTVAL string_compare(Interp *interpreter,
@@no-spam -1757,34 +1324,15 @@no-spam return -(s2->strlen != 0);
}
+ saneify_string(s1);
+ saneify_string(s2);
+
# if ! DISABLE_GC_DEBUG /* It's easy to forget that string comparison can trigger GC */
if (GC_DEBUG(interpreter))
Parrot_do_dod_run(interpreter, DOD_trace_stack_FLAG);
# endif -
- if (s1->representation == s2->representation) {
- switch (s1->representation) {
- case enum_stringrep_one:
- /* could use memcmp in this one case; faster?? */
- COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
- break;
- case enum_stringrep_two:
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
- break;
- case enum_stringrep_four:
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
- break;
- default:
- /* trouble! */
- break;
- }
-
- return cmp;
- }
- else {
- return cmp_diff_repr(s1, s2);
- }
+ return CHARSET_COMPARE(interpreter, s1, s2);
}
@@no-spam -1848,16 +1396,8 @@no-spam * both strings are non-null * both strings have same length */
+ return CHARSET_COMPARE(interpreter, s1, s2);
- if (s1->representation == s2->representation) {
- return memcmp(s1->strstart, s2->strstart, s1->bufused);
- }
- else {
- /* all the fast shortcuts have been taken - * now just left with compare - */
- return cmp_diff_repr(s1, s2);
- }
}
/*
@@no-spam -1922,8 +1462,6 @@no-spam /* think about case of dest string is one of the operands */
if (s1 && s2) {
minlen = s1->strlen > s2->strlen ? s2->strlen : s1->strlen;
- maxrep = s1->representation >= s2->representation ?
- s1->representation : s2->representation;
}
if (dest && *dest) {
@@no-spam -1942,69 +1480,20 @@no-spam return res;
}
else {
- _string_upscale(interpreter, res, maxrep, 0);
+ if (s1->encoding != s2->encoding || s1->charset != s2->charset) {
+ internal_exception(UNIMPLEMENTED, "Can't do cross-type bitwwise and");

+ }
}
-
#if ! DISABLE_GC_DEBUG /* trigger GC for debug */
if (interpreter && GC_DEBUG(interpreter))
Parrot_do_dod_run(interpreter, DOD_trace_stack_FLAG);
#endif - make_writable(interpreter, &res, minlen, res->representation);
+ make_writable(interpreter, &res, minlen, enum_stringrep_one);
- if (s1->representation == s2->representation) {
- switch (s1->representation) {
- case enum_stringrep_one:
BITWISE_AND_STRINGS(Parrot_UInt1, Parrot_UInt1,
Parrot_UInt1, s1, s2, res, minlen);
- break;
- case enum_stringrep_two:
- BITWISE_AND_STRINGS(Parrot_UInt2, Parrot_UInt2,
- Parrot_UInt2, s1, s2, res, minlen);
- break;
- case enum_stringrep_four:
- BITWISE_AND_STRINGS(Parrot_UInt4, Parrot_UInt4,
- Parrot_UInt4, s1, s2, res, minlen);
- break;
- default:
- /* trouble! */
- break;
- }
- }
- else {
- /* make this 3 more cases, rather than 6 */
- STRING *larger;
- STRING *smaller;
-
- if (s1->representation > s2->representation) {
- larger = s1;
- smaller = s2;
- }
- else {
- larger = s2;
- smaller = s1;
- }
-
- if (larger->representation == enum_stringrep_four) {
- if (smaller->representation == enum_stringrep_two) {
- BITWISE_AND_STRINGS(Parrot_UInt4, Parrot_UInt2,
- Parrot_UInt4, larger, smaller, res, minlen);
- }
- else {
- /* smaller->representation == enum_stringrep_one */
- BITWISE_AND_STRINGS(Parrot_UInt4, Parrot_UInt1,
- Parrot_UInt4, larger, smaller, res, minlen);
- }
- }
- else {
- /* larger->representation == enum_stringrep_two,
- * smaller->representation == enum_stringrep_one - */
- BITWISE_AND_STRINGS(Parrot_UInt2, Parrot_UInt1,
- Parrot_UInt2, larger, smaller, res, minlen);
- }
- }
res->strlen = minlen;
res->bufused = string_max_bytes(interpreter, res, res->strlen);
@@no-spam -2081,10 +1570,6 @@no-spam if (s2 && s2->bufused > maxlen)
maxlen = s2->bufused;
- maxrep = s1 ? s1->representation: enum_stringrep_one;
- if (s2 && s2->representation > maxrep)
- maxrep = s2->representation;
-
if (dest && *dest)
res = *dest;
else if (!s1 && !s2)
@@no-spam -2104,61 +1589,10 @@no-spam Parrot_do_dod_run(interpreter, DOD_trace_stack_FLAG);
#endif - make_writable(interpreter, &res, maxlen, res->representation);
+ make_writable(interpreter, &res, maxlen, enum_stringrep_one);
- if (!s1 || !s2 || (s1->representation == s2->representation)) {
- switch (maxrep) {
- case enum_stringrep_one:
BITWISE_OR_STRINGS(Parrot_UInt1, Parrot_UInt1, Parrot_UInt1,
s1, s2, res, maxlen, |);
- break;
- case enum_stringrep_two:
- BITWISE_OR_STRINGS(Parrot_UInt2, Parrot_UInt2, Parrot_UInt2,
- s1, s2, res, maxlen, |);
- break;
- case enum_stringrep_four:
- BITWISE_OR_STRINGS(Parrot_UInt4, Parrot_UInt4, Parrot_UInt4,
- s1, s2, res, maxlen, |);
- break;
- default:
- /* trouble! */
- break;
- }
- }
- else {
- /* make this 3 more cases, rather than 6 */
- STRING *larger;
- STRING *smaller;
-
- if (s1->representation > s2->representation) {
- larger = s1;
- smaller = s2;
- }
- else {
- larger = s2;
- smaller = s1;
- }
-
- if (larger->representation == enum_stringrep_four) {
- if (smaller->representation == enum_stringrep_two) {
- BITWISE_OR_STRINGS(Parrot_UInt4, Parrot_UInt2, Parrot_UInt4,
- larger, smaller, res, maxlen, |);
- }
- else {
- /* smaller->representation == enum_stringrep_one */
- BITWISE_OR_STRINGS(Parrot_UInt4, Parrot_UInt1, Parrot_UInt4,
- larger, smaller, res, maxlen, |);
- }
- }
- else {
- /* larger->representation == enum_stringrep_two,
- * smaller->representation == enum_stringrep_one - */
- BITWISE_OR_STRINGS(Parrot_UInt2, Parrot_UInt1, Parrot_UInt2,
- larger, smaller, res, maxlen, |);
- }
- }
-
res->strlen = maxlen;
res->bufused = string_max_bytes(interpreter, res, res->strlen);
@@no-spam -2194,10 +1628,6 @@no-spam if (s2 && s2->bufused > maxlen)
maxlen = s2->bufused;
- maxrep = s1 ? s1->representation: enum_stringrep_one;
- if (s2 && s2->representation > maxrep)
- maxrep = s2->representation;
-
if (dest && *dest)
res = *dest;
else if (!s1 && !s2)
@@no-spam -2217,61 +1647,10 @@no-spam Parrot_do_dod_run(interpreter, DOD_trace_stack_FLAG);
#endif - make_writable(interpreter, &res, maxlen, res->representation);
+ make_writable(interpreter, &res, maxlen, enum_stringrep_one);
- if (!s1 || !s2 || (s1->representation == s2->representation)) {
- switch (maxrep) {
- case enum_stringrep_one:
BITWISE_OR_STRINGS(Parrot_UInt1, Parrot_UInt1, Parrot_UInt1,
s1, s2, res, maxlen, ^);
- break;
- case enum_stringrep_two:
- BITWISE_OR_STRINGS(Parrot_UInt2, Parrot_UInt2, Parrot_UInt2,
- s1, s2, res, maxlen, ^);
- break;
- case enum_stringrep_four:
- BITWISE_OR_STRINGS(Parrot_UInt4, Parrot_UInt4, Parrot_UInt4,
- s1, s2, res, maxlen, ^);
- break;
- default:
- /* trouble! */
- break;
- }
- }
- else {
- /* make this 3 more cases, rather than 6 */
- STRING *larger;
- STRING *smaller;
-
- if (s1->representation > s2->representation) {
- larger = s1;
- smaller = s2;
- }
- else - {
- larger = s2;
- smaller = s1;
- }
-
-