cvsuser 05/03/01 09:25:47
Modified: charset ascii.c binary.c iso-8859-1.c
encodings utf8.c utf8.h
include/parrot charset.h encoding.h
src encoding.c string.c
Log:
Strings. Finally. 10 - stub in utf8 encoding
* dummy utf8 encoding functions
* attach a preferred_encoding to the charset structure
* use it in string.c
Revision Changes Path
1.15 +78 -61 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -r1.14 -r1.15
--- ascii.c 1 Mar 2005 15:41:25 -0000 1.14
+++ ascii.c 1 Mar 2005 17:25:44 -0000 1.15
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.14 2005/03/01 15:41:25 leo Exp $
+$Id: ascii.c,v 1.15 2005/03/01 17:25:44 leo Exp $
=head1 NAME
@@no-spam -19,8 +19,16 @@no-spam
#include "ascii.h"
#include <assert.h>
-/* The encoding we prefer, given a choice */
-static ENCODING *preferred_encoding;
+#ifdef EXCEPTION
+# undef EXCEPTION
+#endif
+
+/*
+ * EXCEPTION(err, str) forwards to real_exception() with a NULL second
+ * argument. The expansion names `interpreter` directly, so an identifier
+ * of that name must be in scope wherever the macro is used.
+ *
+ * TODO check interpreter error and warnings setting
+ */
+
+#define EXCEPTION(err, str) \
+ real_exception(interpreter, NULL, err, str)
#define WHITESPACE 1
#define WORDCHAR 2
@@no-spam -50,7 +58,6 @@no-spam
ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
unsigned char type, const unsigned char *table)
{
-
for (; start < string->strlen; start++) {
if (table[ENCODING_GET_BYTE(interpreter, string, start)] & type) {
return start;
@@no-spam -97,10 +104,26 @@no-spam
static STRING *
-from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
+from_charset(Interp *interpreter, STRING *src, STRING *dest)
{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
+ UINTVAL offs, c;
+ if (dest) {
+ Parrot_reallocate_string(interpreter, dest, src->strlen);
+ dest->bufused = src->strlen;
+ dest->strlen = src->strlen;
+ }
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = ENCODING_GET_CODEPOINT(interpreter, src, offs);
+ if (c >= 0x80) {
+ EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
+ }
+ if (dest)
+ ENCODING_SET_BYTE(interpreter, dest, offs, c);
+ }
+ if (dest)
+ return dest;
+ src->charset = Parrot_ascii_charset_ptr;
+ return src;
}
static STRING *
@@no-spam -255,7 +278,7 @@no-spam
UINTVAL cl, cr;
for (offs = 0; offs < min_len; ++offs) {
cl = ENCODING_GET_BYTE(interpreter, lhs, offs);
- cr = ENCODING_GET_BYTE(interpreter, rhs, offs);
+ cr = ENCODING_GET_CODEPOINT(interpreter, rhs, offs);
retval = cl - cr;
if (retval)
break;
@@no-spam -510,59 +533,53 @@no-spam
CHARSET *
Parrot_charset_ascii_init(Interp *interpreter)
{
- CHARSET *return_set = Parrot_new_charset(interpreter);
- CHARSET base_set = {
- "ascii",
- ascii_get_graphemes,
- ascii_get_graphemes_inplace,
- set_graphemes,
- ascii_to_charset,
- ascii_to_unicode,
- from_charset,
- from_unicode,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- ascii_compare,
- ascii_cs_index,
- ascii_cs_rindex,
- validate,
- is_wordchar,
- find_wordchar,
- find_not_wordchar,
- is_whitespace,
- find_whitespace,
- find_not_whitespace,
- is_digit,
- find_digit,
- find_not_digit,
- is_punctuation,
- find_punctuation,
- find_not_punctuation,
- ascii_is_newline,
- ascii_find_newline,
- ascii_find_not_newline,
- find_word_boundary,
- string_from_codepoint,
- ascii_compute_hash,
- {NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}
- };
-
- /* Snag the global. This is... bad. Should be properly fixed at some
- point */
- preferred_encoding = Parrot_fixed_8_encoding_ptr;
-
-/* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
-
- memcpy(return_set, &base_set, sizeof(CHARSET));
- Parrot_register_charset(interpreter, "ascii", return_set);
- return return_set;
+ CHARSET *return_set = Parrot_new_charset(interpreter);
+ static const CHARSET base_set = {
+ "ascii",
+ ascii_get_graphemes,
+ ascii_get_graphemes_inplace,
+ set_graphemes,
+ ascii_to_charset,
+ ascii_to_unicode,
+ from_charset,
+ from_unicode,
+ compose,
+ decompose,
+ upcase,
+ downcase,
+ titlecase,
+ upcase_first,
+ downcase_first,
+ titlecase_first,
+ ascii_compare,
+ ascii_cs_index,
+ ascii_cs_rindex,
+ validate,
+ is_wordchar,
+ find_wordchar,
+ find_not_wordchar,
+ is_whitespace,
+ find_whitespace,
+ find_not_whitespace,
+ is_digit,
+ find_digit,
+ find_not_digit,
+ is_punctuation,
+ find_punctuation,
+ find_not_punctuation,
+ ascii_is_newline,
+ ascii_find_newline,
+ ascii_find_not_newline,
+ find_word_boundary,
+ string_from_codepoint,
+ ascii_compute_hash,
+ NULL
+ };
+
+ memcpy(return_set, &base_set, sizeof(CHARSET));
+ return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
+ Parrot_register_charset(interpreter, "ascii", return_set);
+ return return_set;
}
STRING *
1.11 +48 -58 parrot/charset/binary.c
Index: binary.c
===================================================================
RCS file: /cvs/public/parrot/charset/binary.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- binary.c 1 Mar 2005 14:19:45 -0000 1.10
+++ binary.c 1 Mar 2005 17:25:44 -0000 1.11
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: binary.c,v 1.10 2005/03/01 14:19:45 leo Exp $
+$Id: binary.c,v 1.11 2005/03/01 17:25:44 leo Exp $
=head1 NAME
@@no-spam -18,9 +18,6 @@no-spam
#include "binary.h"
#include "ascii.h"
-/* The encoding we prefer, given a choice */
-static ENCODING *preferred_encoding;
-
#ifdef EXCEPTION
# undef EXCEPTION
#endif
@@no-spam -249,60 +246,53 @@no-spam
CHARSET *
Parrot_charset_binary_init(Interp *interpreter)
{
- CHARSET *return_set = Parrot_new_charset(interpreter);
- CHARSET base_set = {
- "binary",
- ascii_get_graphemes,
- ascii_get_graphemes_inplace,
- set_graphemes,
- to_charset,
- to_unicode,
- from_charset,
- from_unicode,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- compare,
- cs_index,
- cs_rindex,
- validate,
- is_wordchar,
- find_wordchar,
- find_not_wordchar,
- is_whitespace,
- find_whitespace,
- find_not_whitespace,
- is_digit,
- find_digit,
- find_not_digit,
- is_punctuation,
- find_punctuation,
- find_not_punctuation,
- is_newline,
- find_newline,
- find_not_newline,
- find_word_boundary,
- string_from_codepoint,
- ascii_compute_hash,
- {NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}
- };
-
- /* Snag the global. This is... bad. Should be properly fixed at some
- point */
- preferred_encoding = Parrot_fixed_8_encoding_ptr;
-
-/* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
-
-
- memcpy(return_set, &base_set, sizeof(CHARSET));
- Parrot_register_charset(interpreter, "binary", return_set);
- return return_set;
+ CHARSET *return_set = Parrot_new_charset(interpreter);
+ static const CHARSET base_set = {
+ "binary",
+ ascii_get_graphemes,
+ ascii_get_graphemes_inplace,
+ set_graphemes,
+ to_charset,
+ to_unicode,
+ from_charset,
+ from_unicode,
+ compose,
+ decompose,
+ upcase,
+ downcase,
+ titlecase,
+ upcase_first,
+ downcase_first,
+ titlecase_first,
+ compare,
+ cs_index,
+ cs_rindex,
+ validate,
+ is_wordchar,
+ find_wordchar,
+ find_not_wordchar,
+ is_whitespace,
+ find_whitespace,
+ find_not_whitespace,
+ is_digit,
+ find_digit,
+ find_not_digit,
+ is_punctuation,
+ find_punctuation,
+ find_not_punctuation,
+ is_newline,
+ find_newline,
+ find_not_newline,
+ find_word_boundary,
+ string_from_codepoint,
+ ascii_compute_hash,
+ NULL
+ };
+
+ memcpy(return_set, &base_set, sizeof(CHARSET));
+ return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
+ Parrot_register_charset(interpreter, "binary", return_set);
+ return return_set;
}
1.12 +4 -14 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- iso-8859-1.c 1 Mar 2005 15:41:25 -0000 1.11
+++ iso-8859-1.c 1 Mar 2005 17:25:44 -0000 1.12
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.11 2005/03/01 15:41:25 leo Exp $
+$Id: iso-8859-1.c,v 1.12 2005/03/01 17:25:44 leo Exp $
=head1 NAME
@@no-spam -29,9 +29,6 @@no-spam
#define EXCEPTION(err, str) \
real_exception(interpreter, NULL, err, str)
-/* The encoding we prefer, given a choice */
-static ENCODING *preferred_encoding;
-
#define WHITESPACE 1
#define WORDCHAR 2
#define PUNCTUATION 4
@@no-spam -340,7 +337,7 @@no-spam
Parrot_charset_iso_8859_1_init(Interp *interpreter)
{
CHARSET *return_set = Parrot_new_charset(interpreter);
- CHARSET base_set = {
+ static const CHARSET base_set = {
"iso-8859-1",
ascii_get_graphemes,
ascii_get_graphemes_inplace,
@@no-spam -379,18 +376,11 @@no-spam
find_word_boundary,
string_from_codepoint,
ascii_compute_hash,
- {NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}
-
+ NULL
};
- /* Snag the global. This is... bad. Should be properly fixed at some
- point */
- preferred_encoding = Parrot_fixed_8_encoding_ptr;
-
- /* preferred_encoding = Parrot_load_encoding(interpreter, "fixed_8"); */
-
memcpy(return_set, &base_set, sizeof(CHARSET));
+ return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
Parrot_register_charset(interpreter, "iso-8859-1", return_set);
return return_set;
}
1.20 +175 -5 parrot/encodings/utf8.c
Index: utf8.c
===================================================================
RCS file: /cvs/public/parrot/encodings/utf8.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -u -r1.19 -r1.20
--- utf8.c 9 Apr 2004 20:32:08 -0000 1.19
+++ utf8.c 1 Mar 2005 17:25:45 -0000 1.20
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: utf8.c,v 1.19 2004/04/09 20:32:08 dan Exp $
+$Id: utf8.c,v 1.20 2005/03/01 17:25:45 leo Exp $
=head1 NAME
@@no-spam -20,6 +20,9 @@no-spam
#include "parrot/parrot.h"
#include "parrot/unicode.h"
+#include "utf8.h"
+
+/* Shorthand for the utf8 entry points that are not implemented yet:
+ raises an UNIMPLEMENTED internal exception with a fixed message. */
+#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl utf8")
const char Parrot_utf8skip[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */
@@no-spam -56,10 +59,10 @@no-spam
*/
static UINTVAL
-utf8_characters(const void *ptr, UINTVAL bytes)
+utf8_characters(const void *ptr, UINTVAL byte_offs)
{
const utf8_t *u8ptr = ptr;
- const utf8_t *u8end = u8ptr + bytes;
+ const utf8_t *u8end = u8ptr + byte_offs;
UINTVAL characters = 0;
while (u8ptr < u8end) {
@@no-spam -284,14 +287,181 @@no-spam
i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
}
+
+/* Presumably meant to convert src to this encoding (the void return
+ suggests in place -- TODO confirm). Not implemented yet: the body
+ only invokes UNIMPL, which raises an UNIMPLEMENTED internal exception. */
+static void
+to_encoding(Interp *interpreter, STRING *src)
+{
+ UNIMPL;
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception). The return
+ statement after it yields NULL. */
+static STRING *
+copy_to_encoding(Interp *interpreter, STRING *src)
+{
+ STRING *return_string = NULL;
+
+ UNIMPL;
+ return return_string;
+}
+
+/* Return the codepoint at position offset of src. Codepoints are not
+ single bytes in this encoding: the start pointer is advanced with
+ utf8_skip_forward(src->strstart, offset) and the sequence found
+ there is decoded with utf8_decode().
+ NOTE(review): offset is presumably a character index, and no bounds
+ check against the string is made here -- confirm with callers. */
+static UINTVAL
+get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+ const void *start;
+
+ start = utf8_skip_forward(src->strstart, offset);
+ return utf8_decode(start);
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception); src is not
+ modified by this body. */
+static void
+set_codepoint(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL codepoint)
+{
+ UNIMPL;
+}
+
+/* Return the raw byte at byte index offset of src's buffer.
+ An offset at or beyond src->bufused raises an internal exception. */
+static UINTVAL
+get_byte(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+ const unsigned char *contents = src->strstart;
+ if (offset >= src->bufused) {
+ /* "%i" expects int; cast the unsigned values to a type that
+ matches the conversion actually used */
+ internal_exception(0,
+ "get_byte past the end of the buffer (%lu of %lu)",
+ (unsigned long)offset, (unsigned long)src->bufused);
+ }
+ return contents[offset];
+}
+
+/* Store byte at byte index offset of src's buffer; an offset at or
+ beyond src->bufused raises an internal exception.
+ NOTE(review): src is declared const although its buffer is written
+ through strstart -- confirm whether the const is intended. */
+static void
+set_byte(Interp *interpreter, const STRING *src,
+ UINTVAL offset, UINTVAL byte)
+{
+ unsigned char *contents;
+ if (offset >= src->bufused) {
+ internal_exception(0, "set_byte past the end of the buffer");
+ }
+ contents = src->strstart;
+ contents[offset] = byte;
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception). The return
+ statement after it yields NULL. */
+static STRING *
+get_codepoints(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count)
+{
+ STRING *return_string = NULL;
+ UNIMPL;
+ return return_string;
+}
+
+/* Return a new string header describing count bytes of src starting at
+ byte index offset. The header comes from Parrot_make_COW_reference()
+ and inherits src's encoding and charset. offset and count are not
+ checked against src->bufused here. */
+static STRING *
+get_bytes(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count)
+{
+ STRING *return_string = Parrot_make_COW_reference(interpreter,
+ src);
+ return_string->encoding = src->encoding; /* XXX */
+ return_string->charset = src->charset;
+
+ /* narrow the view to [offset, offset + count) */
+ return_string->strstart = (char *)return_string->strstart + offset ;
+ return_string->bufused = count;
+
+ /* NOTE(review): strlen is set to the byte count; for multi-byte
+ sequences this may differ from the character count -- confirm */
+ return_string->strlen = count;
+ /* reset the hash value for the new header */
+ return_string->hashval = 0;
+
+ return return_string;
+}
+
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception); dest_string
+ is left untouched and the return statement yields NULL. */
+static STRING *
+get_codepoints_inplace(Interp *interpreter, STRING *src,
+ STRING *dest_string, UINTVAL offset, UINTVAL count)
+{
+
+ UNIMPL;
+ return NULL;
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception);
+ return_string is left untouched and the return statement yields NULL. */
+static STRING *
+get_bytes_inplace(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *return_string)
+{
+ UNIMPL;
+ return NULL;
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception); src is not
+ modified by this body. */
+static void
+set_codepoints(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *new_codepoints)
+{
+ UNIMPL;
+}
+
+/* Stub: invokes UNIMPL (UNIMPLEMENTED internal exception); src is not
+ modified by this body. */
+static void
+set_bytes(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *new_bytes)
+{
+ UNIMPL;
+}
+
+/* Unconditionally makes the string be in this encoding, if that's
+ valid. Not implemented yet: the body only invokes UNIMPL, which
+ raises an UNIMPLEMENTED internal exception. */
+static void
+become_encoding(Interp *interpreter, STRING *src)
+{
+ UNIMPL;
+}
+
+
+/* Report the number of codepoints in src by returning the stored
+ src->strlen; nothing is recounted from the buffer. */
+static UINTVAL
+codepoints(Interp *interpreter, STRING *src)
+{
+ return src->strlen;
+}
+
+/* Report the number of bytes in use by src by returning src->bufused. */
+static UINTVAL
+bytes(Interp *interpreter, STRING *src)
+{
+ return src->bufused;
+}
+
+/* Create and register the utf8 encoding.
+ Obtains a fresh ENCODING from Parrot_new_encoding(), copies the
+ static function table below into it, registers it under the name
+ "utf8" with Parrot_register_encoding() and returns it. Most entries
+ of the table are still stubs that raise UNIMPLEMENTED. */
+ENCODING *
+Parrot_encoding_utf8_init(Interp *interpreter)
+{
+ ENCODING *return_encoding = Parrot_new_encoding(interpreter);
+
+ /* initializer order must follow the member order of ENCODING */
+ static const ENCODING base_encoding = {
+ "utf8",
+ 6, /* Max bytes per codepoint */
+ to_encoding,
+ copy_to_encoding,
+ get_codepoint,
+ set_codepoint,
+ get_byte,
+ set_byte,
+ get_codepoints,
+ get_codepoints_inplace,
+ get_bytes,
+ get_bytes_inplace,
+ set_codepoints,
+ set_bytes,
+ become_encoding,
+ codepoints,
+ bytes
+ };
+ memcpy(return_encoding, &base_encoding, sizeof(ENCODING));
+ Parrot_register_encoding(interpreter, "utf8", return_encoding);
+ return return_encoding;
+}
+
/*
=back
=head1 SEE ALSO
-F<encodings/utf16.c>,
-F<encodings/utf32.c>,
+F<encodings/fixed_8.c>,
F<src/string.c>,
F<include/parrot/string.h>,
F<docs/string.pod>.
1.2 +2 -16 parrot/encodings/utf8.h
Index: utf8.h
===================================================================
RCS file: /cvs/public/parrot/encodings/utf8.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- utf8.h 4 Nov 2004 18:37:50 -0000 1.1
+++ utf8.h 1 Mar 2005 17:25:45 -0000 1.2
@@no-spam -1,7 +1,7 @@no-spam
/* utf8.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: utf8.h,v 1.1 2004/11/04 18:37:50 dan Exp $
+ * $Id: utf8.h,v 1.2 2005/03/01 17:25:45 leo Exp $
* Overview:
* This is the header for the utf8 variable-width encoding.
* Data Structure and Algorithms:
@@no-spam -13,21 +13,7 @@no-spam
#if !defined(PARROT_ENCODING_UTF8_H_GUARD)
#define PARROT_ENCODING_UTF8_H_GUARD
-static void to_encoding(Interp *interpreter, STRING *source_string);
-static STRING *copy_to_encoding(Interp *interpreter, STRING *source_string);
-static UINTVAL get_codepoint(Interp *interpreter, STRING *source_string, UINTVAL offset);
-static void set_codepoint(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL codepoint);
-static UINTVAL get_byte(Interp *interpreter, STRING *source_string, UINTVAL offset);
-static void set_byte(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL byte);
-static STRING *get_codepoints(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);
-static STRING *get_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count);
-static void set_codepoints(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_codepoints);
-static void set_bytes(Interp *interpreter, STRING *source_string, UINTVAL offset, UINTVAL count, STRING *new_bytes);
-static void become_encoding(Interp *interpreter, STRING *source_string);
-static UINTVAL codepoints(Interp *interpreter, STRING *source_string);
-static UINTVAL bytes(Interp *interpreter, STRING *source_string);
-ENCODING *Parrot_encoding_utf8_init(Interp *interpreter);
-
+ENCODING *Parrot_encoding_utf8_init(Interp *);
#endif /* PARROT_ENCODING_UTF8_H_GUARD */
/*
1.10 +4 -2 parrot/include/parrot/charset.h
Index: charset.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/charset.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- charset.h 1 Mar 2005 14:19:46 -0000 1.9
+++ charset.h 1 Mar 2005 17:25:46 -0000 1.10
@@no-spam -1,7 +1,7 @@no-spam
/* charset.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: charset.h,v 1.9 2005/03/01 14:19:46 leo Exp $
+ * $Id: charset.h,v 1.10 2005/03/01 17:25:46 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding
* Data Structure and Algorithms:
@@no-spam -131,7 +131,7 @@no-spam
charset_find_word_boundary_t find_word_boundary;
charset_string_from_codepoint_t string_from_codepoint;
charset_compute_hash_t compute_hash;
- ENCODING encoding_overrides;
+ ENCODING *preferred_encoding;
};
#define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, offset, count)
@@no-spam -168,6 +168,8 @@no-spam
#define CHARSET_FIND_NOT_NEWLINE(interp, source, offset) ((CHARSET *)source->charset)->find_not_newline(interpreter, source, offset)
#define CHARSET_FIND_WORD_BOUNDARY(interp, source, offset) ((CHARSET *)source->charset)->find_word_boundary(interpreter, source, offset)
#define CHARSET_COMPUTE_HASH(interp, source) ((CHARSET *)source->charset)->compute_hash(interpreter, source)
+/* Yield the preferred_encoding member of the charset attached to source.
+ The interp argument is accepted for symmetry with the other CHARSET_*
+ macros but is not used. The argument and the whole expansion are
+ parenthesized so the macro is safe with expression arguments and
+ inside larger expressions. */
+#define CHARSET_GET_PREFERRED_ENCODING(interp, source) (((CHARSET *)(source)->charset)->preferred_encoding)
+
#define CHARSET_TO_ENCODING(interp, source, offset, count) ((ENCODING *)source->encoding)->to_encoding(interp, source, offset, count)
#define CHARSET_COPY_TO_ENCODING(interp, source) ((ENCODING *)source->encoding)->copy_to_encoding(interp, source)
#define CHARSET_GET_CODEPOINT(interp, source, offset) ((ENCODING *)source->encoding)->get_codepoint(interp, source, offset)
1.32 +2 -1 parrot/include/parrot/encoding.h
Index: encoding.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/encoding.h,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -r1.31 -r1.32
--- encoding.h 27 Feb 2005 12:12:28 -0000 1.31
+++ encoding.h 1 Mar 2005 17:25:46 -0000 1.32
@@no-spam -1,7 +1,7 @@no-spam
/* encoding.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: encoding.h,v 1.31 2005/02/27 12:12:28 leo Exp $
+ * $Id: encoding.h,v 1.32 2005/03/01 17:25:46 leo Exp $
* Overview:
* This is the header for the generic encoding functions
* Data Structure and Algorithms:
@@no-spam -55,6 +55,7 @@no-spam
#if !defined PARROT_NO_EXTERN_ENCODING_PTRS
extern ENCODING *Parrot_fixed_8_encoding_ptr;
+extern ENCODING *Parrot_utf8_encoding_ptr;
#endif
#define PARROT_DEFAULT_ENCODING Parrot_fixed_8_encoding_ptr
1.24 +9 -1 parrot/src/encoding.c
Index: encoding.c
===================================================================
RCS file: /cvs/public/parrot/src/encoding.c,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- encoding.c 27 Feb 2005 12:12:29 -0000 1.23
+++ encoding.c 1 Mar 2005 17:25:47 -0000 1.24
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: encoding.c,v 1.23 2005/02/27 12:12:29 leo Exp $
+$Id: encoding.c,v 1.24 2005/03/01 17:25:47 leo Exp $
=head1 NAME
@@no-spam -17,6 +17,7 @@no-spam
ENCODING *Parrot_default_encoding_ptr;
ENCODING *Parrot_fixed_8_encoding_ptr;
+ENCODING *Parrot_utf8_encoding_ptr;
/* Yep, this needs to be a char * parameter -- it's tough to load in
encodings and such for strings if we can't be sure we've got enough
@@no-spam -34,6 +35,9 @@no-spam
if (!strcmp("fixed_8", encodingname)) {
return Parrot_fixed_8_encoding_ptr;
}
+ if (!strcmp("utf8", encodingname)) {
+ /* hand back the encoding registered under "utf8", not fixed_8 */
+ return Parrot_utf8_encoding_ptr;
+ }
return NULL;
}
@@no-spam -56,6 +60,10 @@no-spam
}
return 1;
}
+ if (!strcmp("utf8", encodingname)) {
+ Parrot_utf8_encoding_ptr = encoding;
+ return 1;
+ }
return 0;
}
1.239 +7 -7 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.238
retrieving revision 1.239
diff -u -r1.238 -r1.239
--- string.c 1 Mar 2005 14:19:48 -0000 1.238
+++ string.c 1 Mar 2005 17:25:47 -0000 1.239
@@no-spam -1,6 +1,6 @@no-spam
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.238 2005/03/01 14:19:48 leo Exp $
+$Id: string.c,v 1.239 2005/03/01 17:25:47 leo Exp $
=head1 NAME
@@no-spam -295,6 +295,8 @@no-spam
* 2) default charset = iso-8859-1
*/
Parrot_encoding_fixed_8_init(interpreter);
+ Parrot_encoding_utf8_init(interpreter);
+
Parrot_charset_iso_8859_1_init(interpreter);
Parrot_charset_binary_init(interpreter);
Parrot_charset_ascii_init(interpreter);
@@no-spam -403,8 +405,8 @@no-spam
s = new_string_header(interpreter, 0);
if (representation == enum_stringrep_one) {
- s->encoding = PARROT_DEFAULT_ENCODING;
s->charset = PARROT_DEFAULT_CHARSET;
+ s->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, s);;
} else {
internal_exception(INVALID_CHARTYPE, "Unsupported representation");
}
@@no-spam -628,21 +630,19 @@no-spam
}
if (strcmp(charset_name, "iso-8859-1") == 0 ) {
- encoding = Parrot_fixed_8_encoding_ptr;
charset = Parrot_iso_8859_1_charset_ptr;
}
else if (strcmp(charset_name, "ascii") == 0 ) {
- encoding = Parrot_fixed_8_encoding_ptr;
charset = Parrot_ascii_charset_ptr;
}
else if (strcmp(charset_name, "binary") == 0 ) {
- encoding = Parrot_fixed_8_encoding_ptr;
charset = Parrot_binary_charset_ptr;
}
else {
internal_exception(UNIMPLEMENTED,
"Can't make '%s' charset strings", charset_name);
}
+ encoding = charset->preferred_encoding;
return string_make_direct(interpreter, buffer, len,
encoding, charset, flags);
@@no-spam -2615,8 +2615,8 @@no-spam
return dest;
}
dest->charset = new_charset;
- /* XXX prefered encoding for charset */
- dest->encoding = PARROT_DEFAULT_ENCODING;
+ /* get preferred encoding for charset */
+ dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, dest);
}
else {
if (new_charset == src->charset) {