PERL CVS PARROT 44 CVS COMMIT PARROT T OP STRING CS T
Date: 1 Mar 2005 14:19:49 -0000

Subject: cvs commit: parrot/t/op string_cs.t
From: leo@no-spam (Leopold Toetsch)

cvsuser 05/03/01 06:19:49

Modified: charset ascii.c ascii.h binary.c binary.h iso-8859-1.c iso-8859-1.h include/parrot charset.h exceptions.h string_funcs.h ops ops.num string.ops src charset.c string.c t/op string_cs.t Log:
Strings. Finally. 8 - charset conversion * trans_charset opcodes * Parrot_string_trans_charset() interface * charset converter registration and lookup * adapt converter function signature to take a dest STRING * iso-8859-1 to ascii conversion Revision Changes Path 1.13 +13 -18 parrot/charset/ascii.c Index: ascii.c ===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v retrieving revision 1.12
retrieving revision 1.13
diff -u -r1.12 -r1.13
--- ascii.c 1 Mar 2005 11:06:26 -0000 1.12
+++ ascii.c 1 Mar 2005 14:19:45 -0000 1.13
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.12 2005/03/01 11:06:26 leo Exp $
+$Id: ascii.c,v 1.13 2005/03/01 14:19:45 leo Exp $
=head1 NAME @@no-spam -95,37 +95,33 @@no-spam offset, count, dest_string);
}
-static void -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING *
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING *dest)

{
internal_exception(UNIMPLEMENTED, "to_charset for ascii not implemented");

+ return NULL;
}
-static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
-{
- STRING *return_string = NULL;
-
- return return_string;
-}
-static void -to_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "to_unicode for ascii not implemented");

+ return NULL;
}
-static void -from_charset(Interp *interpreter, STRING *source_string)
+static STRING *
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void -from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
/* A noop. can't compose ascii */
@@no-spam -511,7 +507,6 @@no-spam ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
1.10 +1 -5 parrot/charset/ascii.h Index: ascii.h ===================================================================
RCS file: /cvs/public/parrot/charset/ascii.h,v retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- ascii.h 1 Mar 2005 11:06:26 -0000 1.9
+++ ascii.h 1 Mar 2005 14:19:45 -0000 1.10
@@no-spam -1,7 +1,7 @@no-spam /* ascii.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: ascii.h,v 1.9 2005/03/01 11:06:26 leo Exp $
+ * $Id: ascii.h,v 1.10 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the ascii charset functions * Data Structure and Algorithms:
@@no-spam -40,10 +40,6 @@no-spam const STRING *search_string, UINTVAL offset);
size_t ascii_compute_hash(Interp *, STRING *source_string);
-static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);

-static void to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
1.10 +13 -20 parrot/charset/binary.c Index: binary.c ===================================================================
RCS file: /cvs/public/parrot/charset/binary.c,v retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- binary.c 1 Mar 2005 11:06:26 -0000 1.9
+++ binary.c 1 Mar 2005 14:19:45 -0000 1.10
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: binary.c,v 1.9 2005/03/01 11:06:26 leo Exp $
+$Id: binary.c,v 1.10 2005/03/01 14:19:45 leo Exp $
=head1 NAME @@no-spam -36,38 +36,32 @@no-spam replace_count, insert_string);
}
-static void -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING*
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING *dest)

{
internal_exception(UNIMPLEMENTED, "to_charset for binary not implemented");

+ return NULL;
}
-static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
-{
- STRING *return_string = NULL;
- internal_exception(UNIMPLEMENTED,
- "copy_to_charset for binary not implemented");
- return return_string;
-}
-
-static void -to_unicode(Interp *interpreter, STRING *source_string)
+static STRING*
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "to_unicode for binary not implemented");

+ return NULL;
}
-static void -from_charset(Interp *interpreter, STRING *source_string)
+static STRING*
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void -from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
/* A noop. can't compose binary */
@@no-spam -262,7 +256,6 @@no-spam ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
1.7 +1 -4 parrot/charset/binary.h Index: binary.h ===================================================================
RCS file: /cvs/public/parrot/charset/binary.h,v retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- binary.h 1 Mar 2005 11:06:26 -0000 1.6
+++ binary.h 1 Mar 2005 14:19:45 -0000 1.7
@@no-spam -1,7 +1,7 @@no-spam /* binary.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: binary.h,v 1.6 2005/03/01 11:06:26 leo Exp $
+ * $Id: binary.h,v 1.7 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the binary charset functions * Data Structure and Algorithms:
@@no-spam -13,9 +13,6 @@no-spam #if !defined(PARROT_CHARSET_BINARY_H_GUARD)
#define PARROT_CHARSET_BINARY_H_GUARD -static void to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
1.10 +56 -31 parrot/charset/iso-8859-1.c Index: iso-8859-1.c ===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- iso-8859-1.c 1 Mar 2005 11:06:26 -0000 1.9
+++ iso-8859-1.c 1 Mar 2005 14:19:45 -0000 1.10
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.9 2005/03/01 11:06:26 leo Exp $
+$Id: iso-8859-1.c,v 1.10 2005/03/01 14:19:45 leo Exp $
=head1 NAME @@no-spam -18,6 +18,17 @@no-spam #include "iso-8859-1.h"
#include "ascii.h"
+#ifdef EXCEPTION +# undef EXCEPTION +#endif +
+/*
+ * TODO check interpreter error and warnings setting + */
+
+#define EXCEPTION(err, str) \
+ real_exception(interpreter, NULL, err, str)
+
/* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
@@no-spam -55,51 +66,43 @@no-spam replace_count, insert_string);
}
-static void -from_charset(Interp *interpreter, STRING *source_string)
+static STRING *
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void -from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING *
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
- charset_converter_t conversion_func;
- if ((conversion_func = Parrot_find_charset_converter(interpreter,
- source_string->charset, new_charset))) {
- /*
- * XXX conversion_func has wrong signature ?
- *
- * conversion_func(interpreter, new_charset, source_string);
- */
- }
- else {
- to_unicode(interpreter, source_string);
- new_charset->from_charset(interpreter, source_string);
- }
+ internal_exception(UNIMPLEMENTED,
+ "to_unicode for iso-8859-1 not implemented");
+ return NULL;
}
static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING *dest)

{
- STRING *return_string = NULL;
+ charset_converter_t conversion_func;
- return return_string;
-}
+ if ((conversion_func = Parrot_find_charset_converter(interpreter,
+ src->charset, new_charset))) {
+ return conversion_func(interpreter, src, dest);
+ }
+ else {
+ STRING *res = to_unicode(interpreter, src, dest);
+ return new_charset->from_charset(interpreter, res, dest);
-static void -to_unicode(Interp *interpreter, STRING *source_string)
-{
- internal_exception(UNIMPLEMENTED,
- "to_unicode for iso-8859-1 not implemented");
+ }
}
/* A noop. can't compose iso-8859-1 */
@@no-spam -367,7 +370,6 @@no-spam ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
@@no-spam -417,6 +419,29 @@no-spam return return_set;
}
+STRING *
+charset_cvt_iso_8859_1_to_ascii(Interp *interpreter, STRING *src, STRING *dest)

+{
+ UINTVAL offs, c;
+ if (dest) {
+ Parrot_reallocate_string(interpreter, dest, src->strlen);
+ dest->bufused = src->bufused;
+ dest->strlen = src->strlen;
+ }
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = ENCODING_GET_BYTE(interpreter, src, offs);
+ if (c >= 0x80) {
+ EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
+ }
+ if (dest)
+ ENCODING_SET_BYTE(interpreter, dest, offs, c);
+ }
+ if (dest)
+ return dest;
+ src->charset = Parrot_ascii_charset_ptr;
+ return src;
+}
+
/*
* Local variables:
* c-indentation-style: bsd 1.8 +3 -4 parrot/charset/iso-8859-1.h Index: iso-8859-1.h ===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- iso-8859-1.h 1 Mar 2005 11:06:26 -0000 1.7
+++ iso-8859-1.h 1 Mar 2005 14:19:45 -0000 1.8
@@no-spam -1,7 +1,7 @@no-spam /* iso_8859_1.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: iso-8859-1.h,v 1.7 2005/03/01 11:06:26 leo Exp $
+ * $Id: iso-8859-1.h,v 1.8 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the iso_8859-1 charset functions * Data Structure and Algorithms:
@@no-spam -14,9 +14,6 @@no-spam #define PARROT_CHARSET_ISO_8859_1_H_GUARD static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);

-static void to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET *new_charset);

-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
@@no-spam -39,6 +36,8 @@no-spam static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL offset);

static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL offset);

+STRING *charset_cvt_iso_8859_1_to_ascii(Interp *, STRING *src, STRING *dest);

+
CHARSET *Parrot_charset_iso_8859_1_init(Interp *);
#endif /* PARROT_CHARSET_ISO_8859_1_H_GUARD */
1.9 +15 -11 parrot/include/parrot/charset.h Index: charset.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/charset.h,v retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- charset.h 1 Mar 2005 08:30:56 -0000 1.8
+++ charset.h 1 Mar 2005 14:19:46 -0000 1.9
@@no-spam -1,7 +1,7 @@no-spam /* charset.h * Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: charset.h,v 1.8 2005/03/01 08:30:56 leo Exp $
+ * $Id: charset.h,v 1.9 2005/03/01 14:19:46 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding * Data Structure and Algorithms:
@@no-spam -35,11 +35,14 @@no-spam typedef STRING *(*charset_get_graphemes_t)(Interp *, STRING *source_string, UINTVAL offset, UINTVAL count);

typedef STRING *(*charset_get_graphemes_inplace_t)(Interp *, STRING *source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);

typedef void (*charset_set_graphemes_t)(Interp *, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);

-typedef void (*charset_to_charset_t)(Interp *, STRING *source_string, CHARSET *new_charset);

-typedef STRING *(*charset_copy_to_charset_t)(Interp *, STRING *source_string, CHARSET *new_charset);

-typedef void (*charset_to_unicode_t)(Interp *, STRING *source_string);
-typedef void (*charset_from_charset_t)(Interp *, STRING *source_string);
-typedef void (*charset_from_unicode_t)(Interp *, STRING *source_string);
+
+typedef STRING * (*charset_to_charset_t)(Interp *, STRING *source_string,
+ CHARSET *new_charset, STRING *dest);
+typedef STRING * (*charset_to_unicode_t)(Interp *, STRING *src, STRING *dest);

+typedef STRING * (*charset_from_charset_t)(Interp *, STRING *source_string,
+ STRING *dest);
+typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
+ STRING *dest);
typedef void (*charset_compose_t)(Interp *, STRING *source_string);
typedef void (*charset_decompose_t)(Interp *, STRING *source_string);
typedef void (*charset_upcase_t)(Interp *, STRING *source_string);
@@no-spam -77,14 +80,17 @@no-spam INTVAL Parrot_register_charset(Interp *, const char *charsetname, CHARSET *charset);

INTVAL Parrot_make_default_charset(Interp *, const char *charsetname, CHARSET *charset);

CHARSET *Parrot_default_charset(Interp *);
-typedef INTVAL (*charset_converter_t)(Interp *, CHARSET *lhs, CHARSET *rhs);
+typedef STRING* (*charset_converter_t)(Interp *, STRING *src, STRING *dst);
charset_converter_t Parrot_find_charset_converter(Interp *, CHARSET *lhs, CHARSET *rhs);

+void Parrot_register_charset_converter(Interp *,
+ CHARSET *lhs, CHARSET *rhs, charset_converter_t func);
void Parrot_deinit_charsets(Interp *);
INTVAL Parrot_charset_number(Interp *, STRING *charsetname);
STRING* Parrot_charset_name(Interp *, INTVAL);
const char* Parrot_charset_c_name(Interp *, INTVAL);
INTVAL Parrot_charset_number_of_str(Interp *, STRING *src);
+CHARSET* Parrot_get_charset(Interp *, INTVAL number_of_charset);
struct _charset {
const char *name;
@@no-spam -92,7 +98,6 @@no-spam charset_get_graphemes_inplace_t get_graphemes_inplace;
charset_set_graphemes_t set_graphemes;
charset_to_charset_t to_charset;
- charset_copy_to_charset_t copy_to_charset;
charset_to_unicode_t to_unicode;
charset_from_charset_t from_charset;
charset_from_unicode_t from_unicode;
@@no-spam -132,9 +137,8 @@no-spam #define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, offset, count)

#define CHARSET_GET_GRAPHEMES_INPLACE(interp, source, dest, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, dest, offset, count)

#define CHARSET_SET_GRAPHEMES(interp, source, offset, replace_count, insert) ((CHARSET *)source->charset)->set_graphemes(interpreter, source, offset, replace_count, insert)

-#define CHARSET_TO_CHARSET(interp, source, new_charset) ((CHARSET *)source->charset)->to_charset(interpreter, source, new_charset)

-#define CHARSET_COPY_TO_CHARSET(interp, source, new_charset) ((CHARSET *)source->charset)->copy_to_charset(interpreter, source, new_charset)

-#define CHARSET_TO_UNICODE(interp, source) ((CHARSET *)source->charset)->to_unicode(interpreter, source)

+#define CHARSET_TO_CHARSET(interp, source, new_charset, dest) ((CHARSET *)source->charset)->to_charset(interpreter, source, new_charset, dest)

+#define CHARSET_TO_UNICODE(interp, source, dest) ((CHARSET *)source->charset)->to_unicode(interpreter, source, dest)

#define CHARSET_COMPOSE(interp, source) ((CHARSET *)source->charset)->compose(interpreter, source)

#define CHARSET_DECOMPOSE(interp, source) ((CHARSET *)source->charset)->decompose(interpreter, source)

#define CHARSET_UPCASE(interp, source) ((CHARSET *)source->charset)->upcase(interpreter, source)

1.54 +3 -2 parrot/include/parrot/exceptions.h Index: exceptions.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/exceptions.h,v retrieving revision 1.53
retrieving revision 1.54
diff -u -r1.53 -r1.54
--- exceptions.h 6 Jan 2005 00:42:05 -0000 1.53
+++ exceptions.h 1 Mar 2005 14:19:46 -0000 1.54
@@no-spam -1,7 +1,7 @@no-spam /* exceptions.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: exceptions.h,v 1.53 2005/01/06 00:42:05 rubys Exp $
+ * $Id: exceptions.h,v 1.54 2005/03/01 14:19:46 leo Exp $
* Overview:
* define the internal interpreter exceptions * Data Structure and Algorithms:
@@no-spam -119,7 +119,8 @@no-spam WRITE_TO_CONSTCLASS,
NOSPAWN,
INTERNAL_NOT_IMPLEMENTED,
- ERR_OVERFLOW + ERR_OVERFLOW,
+ LOSSY_CONVERSION } exception_type_enum;
/* &end_gen */
1.50 +3 -1 parrot/include/parrot/string_funcs.h Index: string_funcs.h ===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v retrieving revision 1.49
retrieving revision 1.50
diff -u -r1.49 -r1.50
--- string_funcs.h 28 Feb 2005 18:01:22 -0000 1.49
+++ string_funcs.h 1 Mar 2005 14:19:46 -0000 1.50
@@no-spam -1,7 +1,7 @@no-spam /* string_funcs.h * Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info - * $Id: string_funcs.h,v 1.49 2005/02/28 18:01:22 leo Exp $
+ * $Id: string_funcs.h,v 1.50 2005/03/01 14:19:46 leo Exp $
* Overview:
* This is the api header for the string subsystem * Data Structure and Algorithms:
@@no-spam -115,6 +115,8 @@no-spam INTVAL Parrot_string_find_newline(Interp *, STRING *, INTVAL offset);
INTVAL Parrot_string_find_word_boundary(Interp *, STRING *, INTVAL offset);
+STRING* Parrot_string_trans_charset(Interp *, STRING *src,
+ INTVAL charset_nr, STRING *dest);
#endif /* PARROT_IN_CORE */
#endif /* PARROT_STRING_FUNCS_H_GUARD */
1.60 +6 -0 parrot/ops/ops.num Index: ops.num ===================================================================
RCS file: /cvs/public/parrot/ops/ops.num,v retrieving revision 1.59
retrieving revision 1.60
diff -u -r1.59 -r1.60
--- ops.num 28 Feb 2005 18:01:24 -0000 1.59
+++ ops.num 1 Mar 2005 14:19:47 -0000 1.60
@@no-spam -1422,3 +1422,9 @@no-spam find_word_boundary_i_s_ic 1392
find_word_boundary_i_sc_i 1393
find_word_boundary_i_sc_ic 1394
+trans_charset_s_i 1395
+trans_charset_s_ic 1396
+trans_charset_s_s_i 1397
+trans_charset_s_s_ic 1398
+trans_charset_s_sc_i 1399
+trans_charset_s_sc_ic 1400
1.35 +21 -0 parrot/ops/string.ops Index: string.ops ===================================================================
RCS file: /cvs/public/parrot/ops/string.ops,v retrieving revision 1.34
retrieving revision 1.35
diff -u -r1.34 -r1.35
--- string.ops 28 Feb 2005 18:01:24 -0000 1.34
+++ string.ops 1 Mar 2005 14:19:47 -0000 1.35
@@no-spam -634,6 +634,16 @@no-spam Return the charset number of the charset named $2. If the charset doesn't exit, throw an exception.
+=item B<trans_charset>(inout STR, in INT)
+
+Change the string to have the specified charset.
+
+=item B<trans_charset>(out STR, in STR, in INT)
+
+Create a string $1 from $2 with the specified charset.
+
+Both functions may throw an exception on information loss.
+
=cut op charset(out INT, in STR) :base_core {
@@no-spam -655,6 +665,17 @@no-spam goto NEXT();
}
+op trans_charset(inout STR, in INT) {
+ $1 = Parrot_string_trans_charset(interpreter, $1, $2, NULL);
+ goto NEXT();
+}
+
+op trans_charset(out STR, in STR, in INT) {
+ STRING *dest = new_string_header(interpreter, 0);
+ $1 = Parrot_string_trans_charset(interpreter, $2, $3, dest);
+ goto NEXT();
+}
+
=item B<is_whitespace>(out INT, in STR, in INT)
Set $1 to 1 if the codepoint of string $2 at offset $3 is whitespace.
1.9 +60 -1 parrot/src/charset.c Index: charset.c ===================================================================
RCS file: /cvs/public/parrot/src/charset.c,v retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- charset.c 1 Mar 2005 08:31:02 -0000 1.8
+++ charset.c 1 Mar 2005 14:19:48 -0000 1.9
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: charset.c,v 1.8 2005/03/01 08:31:02 leo Exp $
+$Id: charset.c,v 1.9 2005/03/01 14:19:48 leo Exp $
=head1 NAME @@no-spam -14,6 +14,7 @@no-spam #define PARROT_NO_EXTERN_CHARSET_PTRS #include "parrot/parrot.h"
+#include "../charset/iso-8859-1.h"
CHARSET *Parrot_iso_8859_1_charset_ptr;
CHARSET *Parrot_binary_charset_ptr;
@@no-spam -26,8 +27,15 @@no-spam */
typedef struct {
+ CHARSET *to;
+ charset_converter_t func;
+} To_converter;
+
+typedef struct {
CHARSET *charset;
STRING *name;
+ int n_converters;
+ To_converter *to_converters;
} One_charset;
typedef struct {
@@no-spam -126,6 +134,14 @@no-spam return all_charsets->set[number_of_charset].name;
}
+CHARSET*
+Parrot_get_charset(Interp *interpreter, INTVAL number_of_charset)
+{
+ if (number_of_charset >= all_charsets->n_charsets)
+ return NULL;
+ return all_charsets->set[number_of_charset].charset;
+}
+
const char *
Parrot_charset_c_name(Interp *interpreter, INTVAL number_of_charset)
{
@@no-spam -153,6 +169,7 @@no-spam all_charsets->n_charsets++;
all_charsets->set[n].charset = charset;
all_charsets->set[n].name = const_string(interpreter, charsetname);
+ all_charsets->set[n].n_converters = 0;
return 1;
}
@@no-spam -183,6 +200,9 @@no-spam }
if (!strcmp("ascii", charsetname)) {
Parrot_ascii_charset_ptr = charset;
+ Parrot_register_charset_converter(interpreter,
+ Parrot_iso_8859_1_charset_ptr, charset,
+ charset_cvt_iso_8859_1_to_ascii);
return register_charset(interpreter, charsetname, charset);
}
return 0;
@@no-spam -202,12 +222,51 @@no-spam return Parrot_default_charset_ptr;
}
+
charset_converter_t Parrot_find_charset_converter(Interp *interpreter, CHARSET *lhs, CHARSET *rhs)

{
+ int i, j, n, nc;
+
+ n = all_charsets->n_charsets;
+ for (i = 0; i < n; ++i) {
+ if (lhs == all_charsets->set[i].charset) {
+ One_charset *left = all_charsets->set + i;
+
+ nc = left->n_converters;
+ for (j = 0; j < nc; ++j) {
+ if (left->to_converters[j].to == rhs)
+ return left->to_converters[j].func;
+ }
+ }
+ }
return NULL;
}
+void +Parrot_register_charset_converter(Interp *interpreter,
+ CHARSET *lhs, CHARSET *rhs, charset_converter_t func)
+{
+ int i, n, nc;
+
+ n = all_charsets->n_charsets;
+ for (i = 0; i < n; ++i) {
+ if (lhs == all_charsets->set[i].charset) {
+ One_charset *left = all_charsets->set + i;
+
+ nc = left->n_converters++;
+ if (nc) {
+ left->to_converters = mem_sys_realloc(left->to_converters,
+ sizeof(To_converter) * (nc + 1));
+ }
+ else + left->to_converters = mem_sys_allocate(sizeof(To_converter));

+ left->to_converters[nc].to = rhs;
+ left->to_converters[nc].func = func;
+ }
+ }
+}
+
/*
* Local variables:
* c-indentation-style: bsd 1.238 +36 -1 parrot/src/string.c Index: string.c ===================================================================
RCS file: /cvs/public/parrot/src/string.c,v retrieving revision 1.237
retrieving revision 1.238
diff -u -r1.237 -r1.238
--- string.c 28 Feb 2005 18:01:28 -0000 1.237
+++ string.c 1 Mar 2005 14:19:48 -0000 1.238
@@no-spam -1,6 +1,6 @@no-spam /*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.237 2005/02/28 18:01:28 leo Exp $
+$Id: string.c,v 1.238 2005/03/01 14:19:48 leo Exp $
=head1 NAME @@no-spam -2591,6 +2591,41 @@no-spam return CHARSET_FIND_WORD_BOUNDARY(interpreter, s, offset);
}
+STRING*
+Parrot_string_trans_charset(Interp *interpreter, STRING *src,
+ INTVAL charset_nr, STRING *dest)
+{
+ CHARSET *new_charset;
+
+ if (!src)
+ return NULL;
+ new_charset = Parrot_get_charset(interpreter, charset_nr);
+ if (!new_charset)
+ real_exception(interpreter, NULL, INVALID_CHARTYPE,
+ "charset #%d not found", (int) charset_nr);
+ /*
+ * dest is an empty string header or NULL, if an inplace + * operation is desired + */
+ if (dest) {
+ if (new_charset == src->charset) {
+ Parrot_reuse_COW_reference(interpreter, src, dest);
+ dest->charset = new_charset;
+ /* keep encoding */
+ return dest;
+ }
+ dest->charset = new_charset;
+ /* XXX prefered encoding for charset */
+ dest->encoding = PARROT_DEFAULT_ENCODING;
+ }
+ else {
+ if (new_charset == src->charset) {
+ return src;
+ }
+ }
+ return CHARSET_TO_CHARSET(interpreter, src, new_charset, dest);
+}
+
/*
=back 1.6 +62 -2 parrot/t/op/string_cs.t Index: string_cs.t ===================================================================
RCS file: /cvs/public/parrot/t/op/string_cs.t,v retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- string_cs.t 28 Feb 2005 18:01:30 -0000 1.5
+++ string_cs.t 1 Mar 2005 14:19:49 -0000 1.6
@@no-spam -1,6 +1,6 @@no-spam #! perl -w # Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: string_cs.t,v 1.5 2005/02/28 18:01:30 leo Exp $
+# $Id: string_cs.t,v 1.6 2005/03/01 14:19:49 leo Exp $
=head1 NAME @@no-spam -16,7 +16,7 @@no-spam =cut -use Parrot::Test tests => 16;
+use Parrot::Test tests => 20;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@no-spam -245,3 +245,63 @@no-spam CODE 0 2 3 6 -1 ok OUTPUT +
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i");
+ set S0, "abc"
+ find_charset I0, "ascii"
+ trans_charset S1, S0, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end +CODE +abc +ascii +OUTPUT +
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i");
+ set S1, "abc"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end +CODE +abc +ascii +OUTPUT +
+
+output_like( <<'CODE', <<OUTPUT, "trans_charset_s_i - lossy");
+ set S1, "abcä"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print "never\n"
+ end +CODE +/lossy conversion to ascii/
+OUTPUT +
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i - same");
+ set S1, ascii:"abc"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end +CODE +abc +ascii +OUTPUT +