[libromdata] Xbox360_STFS: Fix titles for some packages that were authored incorrectly and have mojibake titles.

Two types of mojibake are detected and fixed:
- UTF-8 parsed as cp1252
- Shift-JIS parsed as cp1252

For the latter case, utf16_to_cp1252() on Linux (and other systems that
use iconv) has been modified to handle five code points that iconv()
doesn't support: 0x81, 0x8D, 0x8F, 0x90, 0x9D.

TODO: Also do this for utf8_to_cp1252() and other cp1252 functions?

On Windows, WideCharToMultiByte() and MultiByteToWideChar() handle these
code points without any issues.

These three title updates now show a correct title:

TU_11LK1UV_0000004000000.0000000000081 (UTF-8 as cp1252)
Name:  'Dodonpachi Daifukkatsu Black Label Title Update #1'
Title: '怒首領蜂 大復活 Black Label'

TU_12501VG_0000004000000.0000000000101 (UTF-8 as cp1252)
Name:  'DREAM C CLUB Title Update #4'
Title: 'ドリームクラブ'

TU_15LG1UH_000000C000000.0000000000083 (Shift-JIS as cp1252)
Name:  'Circle of Students Title Update #1'
Title: '円卓の生徒'

Fixes #450: X360 - Non-Latin Titles appearing as mojibake
Reported by @Masamune3210.
This commit is contained in:
David Korth 2025-05-24 15:02:21 -04:00
parent 34f76050cb
commit 5de651dbbe
4 changed files with 191 additions and 6 deletions

View File

@ -2,11 +2,19 @@
## v2.6 (released 2025/??/??)
* New parser features:
* Xbox360_STFS: Fix titles for some packages that were authored incorrectly
and have mojibake titles. This includes cases where UTF-8 was parsed as
cp1252, and where Shift-JIS was parsed as cp1252.
* Fixes #450: X360 - Non-Latin Titles appearing as mojibake
* Reported by @Masamune3210.
* Bug fixes:
* Windows: Work around a potential libpng crash when attempting to read
empty data as a PNG image. (Needs more debugging for a proper fix...)
* See #451: libpng errors crash due to libpng setjmp/longjmp (Windows 10, release builds only)
* Reported by @Masamune3210.
* Other changes:
* rpcli: Added more colorization for warning messages.
* rpcli: Refactored console handling into a separate library, libgsvt.

View File

@ -31,6 +31,7 @@ using namespace LibRpTexture;
using std::array;
using std::shared_ptr;
using std::string;
using std::u16string;
using std::unique_ptr;
using std::vector;
@ -151,6 +152,18 @@ public:
* @return Default executable on success; nullptr on error.
*/
Xbox360_XEX *openDefaultXex(void);
public:
/**
* Get the title.
*
* Encoded as UTF-16BE, but some titles were incorrectly converted
* from cp1252 when the original text was actually UTF-8 or Shift-JIS,
* so this function has a heuristic to detect and fix this.
*
* @return Title
*/
string getTitle(void) const;
};
ROMDATA_IMPL(Xbox360_STFS)
@ -560,6 +573,66 @@ Xbox360_XEX *Xbox360_STFS_Private::openDefaultXex(void)
return this->xex.get();
}
/**
 * Get the title.
 *
 * The title is stored as UTF-16BE. Some packages were authored incorrectly:
 * the original UTF-8 or Shift-JIS bytes were treated as cp1252 when they were
 * converted to UTF-16, which results in mojibake. This function uses a
 * heuristic to detect the incorrect conversion and undo it.
 *
 * If the heuristic does not trigger, or if the repair produces no text,
 * the title is converted directly from UTF-16BE.
 *
 * @return Title (UTF-8)
 */
string Xbox360_STFS_Private::getTitle(void) const
{
	// TODO: Also check for Japanese characters.
	// If found, this is *not* an incorrect conversion.
	bool isMojibake = false;
	bool isMojibakeSJIS = false;

	// The title is a fixed-size field, so bound the scan by the array size
	// in addition to the NUL check. A title that fills the whole field
	// would otherwise be read past its end.
	const char16_t *const p_end = stfsMetadata.title_name + ARRAY_SIZE(stfsMetadata.title_name);
	for (const char16_t *p = stfsMetadata.title_name; p < p_end && *p != 0; p++) {
		// FIXME: Use byteswapped constants for the cases instead of
		// calling be16_to_cpu() on every character.
		const char16_t c = be16_to_cpu(*p);
		switch (c) {
			case 0x0192:	// 'ƒ': LATIN SMALL LETTER F WITH HOOK
			case 0x00E6:	// 'æ': LATIN SMALL LETTER AE
			case 0x20AC:	// '€': EURO SIGN
			case 0x2020:	// '†': DAGGER
				// Likely mojibake
				isMojibake = true;
				break;

			case 0x0081:
				// Likely mojibake, with Shift-JIS encoding instead of UTF-8.
				// (0x81 is a common Shift-JIS lead byte that has no cp1252 mapping.)
				isMojibake = true;
				isMojibakeSJIS = true;
				break;

			default:
				break;
		}
	}

	if (likely(!isMojibake)) {
		return utf16be_to_utf8(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
	}

	// Convert to cp1252. This undoes the bogus conversion, so the resulting
	// bytes are actually either UTF-8 or Shift-JIS.
#if SYS_BYTEORDER == SYS_LIL_ENDIAN
	const u16string str_utf16 = utf16be_to_utf16(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
	string str_title = utf16_to_cp1252(str_utf16.data(), static_cast<int>(str_utf16.size()));
#else /* SYS_BYTEORDER == SYS_BIG_ENDIAN */
	string str_title = utf16_to_cp1252(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
#endif

	if (isMojibakeSJIS && !str_title.empty()) {
		// The recovered bytes are Shift-JIS (cp932), not UTF-8.
		// Convert them to UTF-8 so the caller always gets UTF-8.
		// NOTE(review): assumes cpN_to_utf8(cp, str, len) is available as the
		// counterpart of utf8_to_cpN()/utf16_to_cpN() -- confirm.
		str_title = cpN_to_utf8(932, str_title.data(), static_cast<int>(str_title.size()));
	}

	if (str_title.empty()) {
		// The repair failed. Show the title as stored instead of nothing.
		return utf16be_to_utf8(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
	}
	return str_title;
}
/** Xbox360_STFS **/
/**
@ -957,9 +1030,7 @@ int Xbox360_STFS::loadFieldData(void)
// Title
if (stfsMetadata->title_name[0] != 0) {
d->fields.addField_string(C_("RomData", "Title"),
utf16be_to_utf8(stfsMetadata->title_name,
ARRAY_SIZE(stfsMetadata->title_name)));
d->fields.addField_string(C_("RomData", "Title"), d->getTitle());
}
// File type

View File

@ -317,18 +317,52 @@ static inline std::string utf8_to_cp1252(const char *str, int len)
return utf8_to_cpN(1252, str, len);
}
#ifdef _WIN32
/**
* Convert UTF-16 text to cp1252.
* Trailing NULL bytes will be removed.
*
* NOTE: On Windows, WideCharToMultiByte() handles "invalid" cp1252 characters.
* We don't need to handle it ourselves.
*
* @param wcs [in] UTF-16 text
* @param len [in] Length of wcs, in characters (-1 for NULL-terminated string)
* @return cp1252 string.
*/
static inline std::string utf16_to_cp1252(const char16_t *wcs, int len)
{
// Thin wrapper: forwards to the generic code page converter with code page 1252.
return utf16_to_cpN(1252, wcs, len);
}
#else /* !_WIN32 */
/**
* Convert UTF-16 text to cp1252.
* Trailing NULL bytes will be removed.
*
* NOTE: On non-Windows systems, iconv() does *not* handle "invalid" cp1252 characters.
* This function preprocesses the string in order to process those characters.
* This is needed in order to handle Xbox 360 "mojibake" encoding.
*
* Declared out-of-line here; this is the non-Windows branch of the #ifdef.
*
* @param wcs [in] UTF-16 text
* @param len [in] Length of wcs, in characters (-1 for NULL-terminated string)
* @return cp1252 string.
*/
std::string utf16_to_cp1252(const char16_t *wcs, int len);
#endif /* _WIN32 */
/**
* Convert UTF-16 text to cp1252.
* Trailing NULL bytes will be removed.
*
* Convenience overload for std::u16string: forwards the string's data
* pointer and its size (in characters) to the pointer/length overload.
*
* NOTE: Specialized function that does *not* ignore "invalid" cp1252 characters.
* This is needed in order to handle Xbox 360 "mojibake" encoding.
*
* @param wcs [in] UTF-16 string
* @return cp1252 string.
*/
static inline std::string utf16_to_cp1252(const std::u16string &wcs)
{
// size() is a count of char16_t units, which is what the callee's len expects.
return utf16_to_cp1252(wcs.data(), static_cast<int>(wcs.size()));
}
/* Shift-JIS (cp932) with cp1252 fallback */

View File

@ -41,8 +41,10 @@ static const char RP_ICONV_UTF16_ENCODING[] = "UTF-16LE";
#include <cassert>
#include <cstdlib>
// C++ STL classes
#include <vector>
using std::string;
using std::u16string;
using std::vector;
namespace LibRpText {
@ -451,4 +453,74 @@ string utf16be_to_utf8(const char16_t *wcs, int len)
return INT_utf16_to_utf8("UTF-16BE", wcs, len);
}
/**
 * Convert UTF-16 text to cp1252.
 * Trailing NULL bytes will be removed.
 *
 * NOTE: On non-Windows systems, iconv() does *not* handle "invalid" cp1252 characters.
 * This function preprocesses the string in order to process those characters.
 * This is needed in order to handle Xbox 360 "mojibake" encoding.
 *
 * @param wcs [in] UTF-16 text
 * @param len [in] Length of wcs, in characters (-1 for NULL-terminated string)
 * @return cp1252 string, or an empty string if the conversion failed.
 */
std::string utf16_to_cp1252(const char16_t *wcs, int len)
{
	len = check_NULL_terminator(wcs, len);

	// Find any "invalid" cp1252 characters.
	// Character indexes in wcs are stored, which should map directly
	// to character indexes in the resulting string.
	// FIXME: This may break if non-BMP characters are present, or if
	// transliteration expands a character into more than one byte.
	vector<int> char_idx;
	for (int i = 0; i < len; i++) {
		switch (wcs[i]) {
			default:
				break;

			case 0x0081:
			case 0x008D:
			case 0x008F:
			case 0x0090:
			case 0x009D:
				// Invalid cp1252 character.
				char_idx.push_back(i);
				break;
		}
	}

	// No invalid characters: convert directly.
	// Otherwise, convert using "//TRANSLIT" so the unsupported characters
	// do not abort the conversion, then manually replace them afterwards.
	const char *const dest_charset = char_idx.empty() ? "CP1252" : "CP1252//TRANSLIT";
	char *const mbs = reinterpret_cast<char*>(rp_iconv((char*)wcs, len*sizeof(*wcs), RP_ICONV_UTF16_ENCODING, dest_charset, false));
	if (!mbs) {
		// Conversion failed...
		return {};
	}

	// Copy the result, then release the conversion buffer so it isn't leaked.
	// NOTE(review): assumes rp_iconv() returns a malloc()'d buffer owned by
	// the caller -- confirm against rp_iconv()'s documentation.
	string str(mbs);
	std::free(mbs);

	// Patch the original byte values back in at the recorded positions.
	// (This loop is a no-op if no invalid characters were found.)
	const int str_size = static_cast<int>(str.size());
	for (const int idx : char_idx) {
		assert(idx < str_size);
		if (idx >= str_size) {
			// Invalid index?
			return {};
		}
		// Each of these code points is < 0x100, so the low byte is the cp1252 byte.
		str[idx] = static_cast<char>(static_cast<uint8_t>(wcs[idx]));
	}

	return str;
}
}