The `env_preferences` crate is being developed under ICU4X by a GSoC student, so it makes sense to switch over.
The implementation is probably very flaky, but it should work for now.
UN2XEIEUIB4ERS3IXOHQT2GCPBKK3JKHCGEVKQFP4SCV5AONFXMQC
HL6ZEJTNQGFQDFI6MJHKITGZHCXC3OYC2O7KRD42D36PEU5C5YVAC
BXLE3JXY37S6M7DGPM4KCL3ABT64HUVJ3NEUOFK56IFDOMUEJ3SQC
CFJKYXUX4FF2DVAOJ3RRTI4JZPP5GMMDTJCEYM2IS57SCRKGZI6AC
C6W7N6N57UCNHEV55HEZ3G7WN2ZOBGMFBB5M5ZPDB2HNNHHTOPBQC
KF65O6ODA2UE2GYYTXFINCJW54CN62LB65NQLZNI5UM2W76ABEJAC
KDUI7LHJRRQRFYPY7ANUNXG6XCUKQ4YYOEL5NG5Y6BRMV6GQ5M7AC
YNEOCYMGMSHQGCL5TOIGWDDKHE4BZ5M7FGY5I6B2V6JO6ZRCLETAC
WBI5HFOBBUMDSGKY2RX3YA6N7YDCJEP23JNEJ7PG5VZXHLYIRJRQC
HCGVXOF7P3KKS2IMGVJWI2POVOZQFPXH26YVBJZRSOYSUM4CHUBQC
BFL2Y7GN6NBXXNAUSD4M6T6CIVQ2OLERPE2CAFSLRF377WFFTVCQC
JZXXFWQKOYAFQLQZDRALXG4KGEDR7JKO3AZ5Q5X7IQTS7BCJP3QAC
LIH6JCXY5GMYQPU5L6HY2NOMJDMEW54THPKJ6YXI62Y2SVXFIAXQC
T6JEWQJ7KI4SQFGIZNRKCWD5DEUVTIPEWXU7AX6WM7IU4DBSQZRQC
HHJDRLLNN36UNIA7STAXEEVBCEMPJNB7SJQOS3TJLLYN4AEZ4MHQC
3NMKD6I57ONAGHEN4PZIAV2KPYESVR4JL3DTWSHXKCMVJBEQ4GIQC
VZYZRAO4EXCHW2LBVFG5ELSWG5SCNDREMJ6RKQ4EKQGI2T7SD3ZQC
KZLFC7OWYNK3G5YNHRANUK3VUVCM6W6J34N7UABYA24XMZWAVVHQC
F5LG7WENUUDRSCTDMA4M6BAC5RWTGQO45C4ZEBZDX6FHCTTHBVGQC
UKFEFT6LSI4K7X6UHQFZYD52DILKXMZMYSO2UYS2FCHNPXIF4BEQC
VNSHGQYNPGKGGPYNVP4Z2RWD7JCSDJVYAADD6UXWBYL6ZRXKLE4AC
SHNZZSZGIBTTD4IV5SMW5BIN5DORUWQVTVTNB5RMRD5CTFNOMJ6AC
6ABVDTXZOHVUDZDKDQS256F74LFIMM5DO3OZWHKRXZBUTPII4WAQC
O77KA6C4UJGZXVGPEA7WCRQH6XYQJPWETSPDXI3VOKOSRQND7JEQC
BANMRGROVYKYRJ4N2P4HSOJ2JVV6VSEB3W34BFXPOEFND5O36CGAC
[package]
name = "locale_select"
version = "0.1.0"
edition = "2021"
[dependencies]
fluent-langneg = { version = "0.14.0", features = ["cldr"] }
icu_locid = "1.5.0"
libc = "0.2.153"
[dev-dependencies]
gettext-rs = "0.7.0"
[lints]
workspace = true
pub mod unix;
/// Select the locale that best fits the user's preferences out of `available`.
///
/// The requested locales are read for the `LC_MESSAGES` category through
/// [`unix::get_locales`] and negotiated against `available` using
/// [`NegotiationStrategy::Matching`], with `default` passed as the negotiation fallback.
///
/// Negotiation may produce several candidates. The first one is returned, as
/// `fluent-langneg` lists matches following the priority of the requested locales.
/// If negotiation produces no candidate at all, a copy of `default` is returned.
pub fn match_locales(
    available: &[LanguageIdentifier],
    default: &LanguageIdentifier,
) -> LanguageIdentifier {
    // TODO: requesting locales should have platform-specific logic
    let requested = unix::get_locales(fetch::unix::LocaleCategory::LC_MESSAGES);
    let supported = fluent_langneg::negotiate_languages(
        &requested,
        available,
        Some(default),
        NegotiationStrategy::Matching,
    );

    // More than one match is a normal outcome of the `Matching` strategy, so pick
    // the highest-priority entry instead of panicking on it.
    // TODO: cloning is wasteful but avoids dealing with lifetimes for now
    supported
        .first()
        .map_or_else(|| default.clone(), |best_match| (*best_match).clone())
}
// TODO: this can probably be enums, not strings
use fluent_langneg::NegotiationStrategy;
use icu_locid::LanguageIdentifier;
pub mod fetch;
/// The kinds of optional subtag that may follow the language in a POSIX locale string.
///
/// Every kind is introduced by its own separator character, see
/// [`OptionalSubtagType::try_from_char`].
#[derive(Debug, Clone, Copy)]
enum OptionalSubtagType {
    /// Introduced by `_`
    Territory,
    /// Introduced by `.`
    Codeset,
    /// Introduced by `@`
    Modifier,
}

impl OptionalSubtagType {
    /// Map a separator character to the kind of subtag it introduces.
    ///
    /// Any character other than `_`, `.` or `@` is not a separator and gives `None`.
    const fn try_from_char(source: char) -> Option<Self> {
        let subtag_type = match source {
            '@' => Self::Modifier,
            '.' => Self::Codeset,
            '_' => Self::Territory,
            _ => return None,
        };

        Some(subtag_type)
    }
}
/// The kind and position of a single subtag separator inside a locale string.
#[derive(Debug, Clone, Copy)]
struct SubtagIndex {
    /// Which kind of subtag the separator introduces
    separator: OptionalSubtagType,
    /// Byte offset of the separator within the string that was searched
    separator_index: usize,
}

impl SubtagIndex {
    /// Build a `SubtagIndex` from an `(index, character)` pair.
    ///
    /// Returns `None` when the character is not a subtag separator.
    /// The pair is taken as a tuple so this can be passed directly to `find_map`.
    const fn try_from_char(captures: (usize, char)) -> Option<Self> {
        let (index, source) = captures;

        match OptionalSubtagType::try_from_char(source) {
            Some(separator) => Some(Self {
                separator,
                separator_index: index,
            }),
            None => None,
        }
    }

    /// Find the first separator in `source` located at or after byte `index_offset`.
    ///
    /// The reported `separator_index` is a byte offset into `source`, so it can be
    /// used for slicing even when `source` contains multi-byte characters.
    /// Returns `None` when no separator is found.
    fn from_str_with_offset(source: &str, index_offset: usize) -> Option<Self> {
        source
            // `char_indices` (not `chars().enumerate()`) so indices are byte offsets
            .char_indices()
            .skip_while(|&(byte_index, _)| byte_index < index_offset)
            .find_map(Self::try_from_char)
    }
}
/// The location and kind of one optional subtag, as yielded by `OptionalSubtagsIterator`.
#[derive(Debug, Clone, Copy)]
struct OptionalSubtag {
/// Index directly after the separator that introduced this subtag
start_index: usize,
/// Index of the next separator, or the length of the source for the last subtag
end_index: usize,
/// Kind of subtag, taken from the separator that introduced it
subtag_type: OptionalSubtagType,
}
/// Iterator over the optional subtags of a locale string, yielding one `OptionalSubtag` per separator.
#[derive(Debug, Clone, Copy)]
struct OptionalSubtagsIterator<'locale> {
/// The string that is scanned for separators
source: &'locale str,
/// Separator of the subtag yielded by the next call to `next()`, `None` once exhausted
current_subtag: Option<SubtagIndex>,
/// Separator following `current_subtag`, if any; it marks where the current subtag ends
next_subtag: Option<SubtagIndex>,
}
impl<'locale> OptionalSubtagsIterator<'locale> {
/// Create an iterator over the optional subtags contained in `source`.
///
/// `source` is expected to be either empty, which gives an empty iterator, or to
/// start with one of the subtag separators.
///
/// # Panics
///
/// Panics if `source` is not empty and its first character is not a valid separator.
fn new(source: &'locale str) -> Self {
    let current_subtag = source.chars().next().map(|first_character| {
        // The panic message is only formatted on failure, not on every call
        SubtagIndex::try_from_char((0, first_character)).unwrap_or_else(|| {
            panic!(
                "The first character in `{source}` ('{first_character}') is not a valid separator."
            )
        })
    });

    Self {
        source,
        current_subtag,
        next_subtag: SubtagIndex::from_str_with_offset(source, 1),
    }
}
impl<'locale> Iterator for OptionalSubtagsIterator<'locale> {
    type Item = OptionalSubtag;

    /// Yield the bounds and kind of the current subtag, then advance to the next separator.
    fn next(&mut self) -> Option<Self::Item> {
        // Nothing left to yield once there is no current separator
        let active = self.current_subtag.take()?;
        let upcoming = self.next_subtag.take();
        let source_length = self.source.len();

        // The subtag runs up to the following separator, or to the end of the
        // source when it is the last one
        let end_index = match upcoming {
            Some(following) => following.separator_index,
            None => source_length,
        };

        // Prepare the state used by the next call
        self.current_subtag = upcoming;
        self.next_subtag = if end_index < source_length {
            SubtagIndex::from_str_with_offset(self.source, end_index + 1)
        } else {
            None
        };

        Some(OptionalSubtag {
            start_index: active.separator_index + 1,
            end_index,
            subtag_type: active.separator,
        })
    }
}
/// A POSIX locale string split into its components, borrowing from the original string.
///
/// For example `en_US.utf8` has the language `en`, the territory `US`, the codeset `utf8`
/// and no modifier.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub struct PosixLocale<'locale> {
/// Everything before the first `_`, `.` or `@` separator
pub language: &'locale str,
/// The subtag introduced by `_`, if present
pub territory: Option<&'locale str>,
/// The subtag introduced by `.`, if present
pub codeset: Option<&'locale str>,
/// The subtag introduced by `@`, if present
pub modifier: Option<&'locale str>,
}
impl<'locale> PosixLocale<'locale> {
    /// Split a POSIX locale string such as `en_US.utf8` into its components.
    ///
    /// The language is everything up to the first `_`, `.` or `@`. Each of those
    /// separators introduces the territory, codeset or modifier respectively.
    /// When the same separator occurs more than once, the last occurrence wins.
    ///
    /// # Panics
    ///
    /// Panics if the language is empty, i.e. if `source` is empty or starts with a separator.
    pub fn from_str(source: &'locale str) -> Self {
        // `str::find` reports a byte offset, which is what slicing needs. A character
        // count would be wrong as soon as the language contains a multi-byte character.
        let language_end_bound = source
            .find(|character: char| OptionalSubtagType::try_from_char(character).is_some())
            .unwrap_or(source.len());

        let mut locale = PosixLocale {
            language: &source[..language_end_bound],
            ..Default::default()
        };
        assert!(!locale.language.is_empty());

        for subtag in OptionalSubtagsIterator::new(&source[language_end_bound..]) {
            // The iterator works on the part after the language, so shift its
            // indices back into `source`
            let start_index = subtag.start_index + language_end_bound;
            let end_index = subtag.end_index + language_end_bound;
            assert!(start_index <= source.len());
            assert!(end_index <= source.len());

            let subtag_slice = &source[start_index..end_index];
            match subtag.subtag_type {
                OptionalSubtagType::Territory => locale.territory = Some(subtag_slice),
                OptionalSubtagType::Codeset => locale.codeset = Some(subtag_slice),
                OptionalSubtagType::Modifier => locale.modifier = Some(subtag_slice),
            }
        }

        locale
    }
}
/// Convert this POSIX locale into an ICU4X [`Locale`].
///
/// Only the language and the territory (as the region) are carried over. The script
/// and variants are left empty, and the `POSIX_KEYWORD` unicode extension is attached.
///
/// # Errors
///
/// Returns the parser error if the language or the territory is rejected by `icu_locid`.
pub fn icu_locale(&self) -> Result<Locale, icu_locid::ParserError> {
    let language = subtags::Language::try_from_bytes(self.language.as_bytes())?;
    let region = self
        .territory
        .map(|territory| subtags::Region::try_from_bytes(territory.as_bytes()))
        .transpose()?;

    // TODO: should script/variants always be empty?
    let id = LanguageIdentifier {
        language,
        script: None,
        region,
        variants: Variants::new(),
    };

    // TODO: should attributes always be empty?
    let extensions = Extensions::from_unicode(Unicode {
        keywords: POSIX_KEYWORD,
        attributes: Attributes::new(),
    });

    Ok(Locale { id, extensions })
}
}
/// Get the user's locales for `category` as ICU4X language identifiers.
///
/// The locale strings come from [`LocaleCategory::get_locales_custom`] and keep its
/// order. Empty strings and strings that cannot be converted to an ICU4X locale are
/// skipped, so the returned list may be empty.
pub fn get_locales(category: LocaleCategory) -> Vec<LanguageIdentifier> {
    category
        .get_locales_custom()
        .iter()
        // A variable that is set to an empty value carries no locale; passing it on
        // would trip the non-empty language assertion in `PosixLocale::from_str`
        .filter(|locale_str| !locale_str.is_empty())
        .filter_map(|locale_str| PosixLocale::from_str(locale_str).icu_locale().ok())
        // TODO: is it ok to strip this posix metadata from the locale?
        .map(|locale| locale.id)
        .collect()
}
use icu_locid::extensions::unicode::{key, value, Attributes, Keywords, Unicode};
use icu_locid::extensions::Extensions;
use icu_locid::subtags::{self, Variants};
use icu_locid::{LanguageIdentifier, Locale};
const POSIX_KEYWORD: Keywords = Keywords::new_single(key!("va"), value!("posix"));
use crate::fetch::unix::LocaleCategory;
use std::env;
use std::ffi::CStr;
const NUL_BYTE: &[u8] = b"\0";
// Generates the `LocaleCategory` enum from a list of `libc` locale constant names.
//
// For every name passed in, a variant with the same name is created. The macro also
// generates the conversions between `LocaleCategory` and the `i32` value of the
// matching `libc` constant, and a lookup of the variant name as a string.
macro_rules! repr_lc {
    ($($variant:ident),+ $(,)?) => {
        /// A locale category, with each variant named after the matching `libc` constant.
        #[derive(Clone, Copy, Debug)]
        #[allow(non_camel_case_types)] // Required for parity with C enum
        pub enum LocaleCategory {
            $($variant,)+
        }

        impl TryFrom<i32> for LocaleCategory {
            type Error = ();

            /// Look up the category whose `libc` constant equals `value`.
            fn try_from(value: i32) -> Result<Self, Self::Error> {
                match value {
                    $(libc::$variant => Ok(Self::$variant),)+
                    _ => Err(())
                }
            }
        }

        // Implemented as `From` rather than `Into`: the blanket implementation in the
        // standard library still provides `Into<i32> for LocaleCategory`.
        impl From<LocaleCategory> for i32 {
            /// Get the value of the `libc` constant this category is named after.
            fn from(category: LocaleCategory) -> Self {
                match category {
                    $(LocaleCategory::$variant => libc::$variant,)+
                }
            }
        }

        impl LocaleCategory {
            /// The name of the variant, e.g. `"LC_MESSAGES"`.
            fn as_str(&self) -> &'static str {
                match self {
                    $(Self::$variant => stringify!($variant),)+
                }
            }
        }
    }
}
// Defines `LocaleCategory` with one variant per name listed here.
// Every name must also exist as a constant in `libc`, as the macro refers to `libc::<name>`.
repr_lc! {
LC_ALL,
LC_CTYPE,
LC_COLLATE,
LC_MESSAGES,
LC_MONETARY,
LC_NUMERIC,
LC_TIME,
LC_ADDRESS,
LC_IDENTIFICATION,
LC_MEASUREMENT,
LC_NAME,
LC_PAPER,
LC_TELEPHONE
}
/// Ask libc for the locale of `category` by calling `setlocale(category, "")`.
///
/// The returned locale name is copied into an owned `String`. Bytes that are not
/// valid UTF-8 are replaced with the Unicode replacement character.
///
/// NOTE(review): `setlocale` changes process-wide state and is not required to be
/// thread-safe, so callers presumably must not race with other locale users. Confirm.
///
/// # Panics
///
/// Panics if `setlocale` returns a null pointer, which it does when the request
/// cannot be honoured.
// TODO: report a failed `setlocale` call to the caller instead of panicking
fn get_locale_libc(category: i32) -> String {
    let empty_cstr = CStr::from_bytes_with_nul(NUL_BYTE)
        .expect("NUL_BYTE is a lone NUL terminator, which is a valid empty C string");

    // SAFETY: `empty_cstr` is a valid NUL-terminated string that outlives the call.
    let locale_string_pointer = unsafe { libc::setlocale(category, empty_cstr.as_ptr()) };
    assert!(
        !locale_string_pointer.is_null(),
        "setlocale() returned a null pointer for category {category}"
    );

    // SAFETY: the pointer was checked to be non-null, and a successful `setlocale`
    // returns a NUL-terminated string. It is copied below, before any later
    // `setlocale` call could overwrite it.
    let locale_c_str = unsafe { CStr::from_ptr(locale_string_pointer) };
    locale_c_str.to_string_lossy().into_owned()
}
impl LocaleCategory {
/// Query the locale from the environment, following the POSIX spec:
/// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_02
///
/// Every variable that is set is added to the returned list, in this order of precedence:
/// 1. LC_ALL
/// 2. The variable named after this category (LC_NUMERIC, LC_TIME, etc.)
/// 3. LANG
///
/// When none of them is set the returned list is empty. Falling back to a default
/// locale is left to the caller.
pub fn get_locales_custom(&self) -> Vec<String> {
// At most three entries: LC_ALL, the category variable and LANG
let mut locales = Vec::with_capacity(3);
if let Ok(global_locale) = env::var("LC_ALL") {
locales.push(global_locale);
}
// `as_str()` is the variant name, which is also the name of the environment variable
if let Ok(category_locale) = env::var(self.as_str()) {
locales.push(category_locale);
}
}
}
if let Ok(lang) = env::var("LANG") {
locales.push(lang);
}
locales
}
pub fn get_locales_libc(self) -> Vec<String> {
let global_locale = get_locale_libc(libc::LC_ALL);
let locale_for_category = get_locale_libc(self.into());
vec![global_locale, locale_for_category]
use std::collections::HashSet;
use gettextrs::LocaleCategory as GettextCategory;
use locale_select::fetch::unix::LocaleCategory as LocaleSelectCategory;
use locale_select::unix::PosixLocale;
/// The gettext categories the tests in this file iterate over.
///
/// Each entry is converted to a `LocaleSelectCategory` through its `i32` value.
const GETTEXT_CATEGORIES: [GettextCategory; 13] = [
GettextCategory::LcAll,
GettextCategory::LcCType,
GettextCategory::LcCollate,
GettextCategory::LcMessages,
GettextCategory::LcMonetary,
GettextCategory::LcNumeric,
GettextCategory::LcTime,
GettextCategory::LcAddress,
GettextCategory::LcIdentification,
GettextCategory::LcMeasurement,
GettextCategory::LcName,
GettextCategory::LcPaper,
GettextCategory::LcTelephone,
];
/// `en_US.utf8` is split into a language, territory and codeset, without a modifier
#[test]
fn simple_en_us() {
    let expected = PosixLocale {
        language: "en",
        territory: Some("US"),
        codeset: Some("utf8"),
        modifier: None,
    };

    assert_eq!(PosixLocale::from_str("en_US.utf8"), expected);
}
/// Check that get_locales_libc() and get_locales_custom() return exactly the same list
#[test]
fn compare_libc_with_custom_impl_exact() {
    for category in GETTEXT_CATEGORIES {
        let category = LocaleSelectCategory::try_from(category as i32).unwrap();

        assert_eq!(category.get_locales_libc(), category.get_locales_custom());
    }
}
/// Check that get_locales_libc() and get_locales_custom() return the same set of locales
///
/// Both results are collected into a HashSet first, so differences in ordering and
/// duplicated items are ignored. This test can therefore still pass when the
/// `_exact()` variant of it fails.
#[test]
fn compare_libc_with_custom_impl_hash_set() {
    for category in GETTEXT_CATEGORIES {
        let category = LocaleSelectCategory::try_from(category as i32).unwrap();

        let from_libc = category
            .get_locales_libc()
            .into_iter()
            .collect::<HashSet<String>>();
        let from_custom = category
            .get_locales_custom()
            .into_iter()
            .collect::<HashSet<String>>();

        // Anything present in only one of the two sets is a mismatch
        let mismatched: Vec<&String> = from_libc.symmetric_difference(&from_custom).collect();
        assert_eq!(mismatched, Vec::<&String>::new());
    }
}
/// Check the first locale from get_locales_libc() against the gettext-rs implementation
#[test]
fn compare_libc_with_gettext() {
    for category in GETTEXT_CATEGORIES {
        let libc_locales = LocaleSelectCategory::try_from(category as i32)
            .unwrap()
            .get_locales_libc();

        let gettext_bytes = gettextrs::setlocale(category, b"").unwrap();
        let gettext_locale = String::from_utf8(gettext_bytes).unwrap();

        assert_eq!(libc_locales[0], gettext_locale);
    }
}
/// Check the first locale from get_locales_custom() against the gettext-rs implementation
#[test]
fn compare_custom_with_gettext() {
    for category in GETTEXT_CATEGORIES {
        let custom_locales = LocaleSelectCategory::try_from(category as i32)
            .unwrap()
            .get_locales_custom();

        let gettext_bytes = gettextrs::setlocale(category, b"").unwrap();
        let gettext_locale = String::from_utf8(gettext_bytes).unwrap();

        assert_eq!(custom_locales[0], gettext_locale);
    }
}
assert_eq!(
parsed_locale.icu_locale().unwrap(),
"en-US-u-va-posix".parse().unwrap()
);
# `locale_select`
A simple library for selecting the user's locale preferences for various ICU4X modules, such as calendar, datetime and decimals.
## Alternatives
This library is specific to ICU4X, but there are many Rust APIs with a similar focus. Here are some that I could find, and what they seem to do differently:
- [`simple-locale`](https://github.com/johnstonskj/simple-locale): very close to this crate, but doesn't use ICU4X libraries
- [`locale_settings`](https://docs.rs/locale-settings/latest/locale_settings): unmaintained(?) version of `simple-locale::settings`
- [`sys_locale`](https://github.com/1password/sys-locale): single locale instead of per-category
- [`utf8-locale`](https://gitlab.com/ppentchev/utf8-locale), [`locale-config`](https://github.com/rust-locale/locale_config): categories are less strongly typed
- [`gettextrs::setlocale()`](https://docs.rs/gettext-rs/latest/gettextrs/fn.setlocale.html): `gettext` bindings; this function can query the locale by setting the locale to an empty string
## Useful links
- POSIX:
- Locale category data: https://www.man7.org/linux/man-pages/man5/locale.5.html
- Locale category definitions: https://www.man7.org/linux/man-pages/man7/locale.7.html
- Description of locale names: https://www.gnu.org/software/libc/manual/html_node/Locale-Names.html
- Locale spec: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html
- Localization variables spec: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_02
- Unicode handling of POSIX identifiers: https://unicode.org/reports/tr35/tr35.html#Legacy_Variants
let selected_locale = locale_select::match_locales(
let canonical_locale = <Self as Localize<W>>::CANONICAL_LOCALE;
// MacOS and Windows return in order of preference, but Linux returns a HashMap
let requested_locales = get_locales();
let selected_locale = fluent_langneg::negotiate_languages(
&requested_locales,
/// Get the user's requested locales on Linux, in order of preference.
///
/// The order is: the `All` category, the `Messages` category, then the `LANG`
/// environment variable. Sources that are not set, and values that cannot be parsed
/// as a `LanguageIdentifier`, are skipped instead of causing a panic.
///
/// # Panics
///
/// Panics if `env_preferences::get_locales()` fails.
#[cfg(target_os = "linux")]
// TODO: does not add `-u-va-posix` extension, see https://unicode.org/reports/tr35/tr35.html#Legacy_Variants
pub fn get_locales() -> Vec<LanguageIdentifier> {
    let mut retrieved_locales = env_preferences::get_locales().unwrap();

    [
        retrieved_locales.remove(&LocaleCategory::All),
        retrieved_locales.remove(&LocaleCategory::Messages),
        std::env::var("LANG").ok(),
    ]
    .into_iter()
    .flatten()
    // These values come from the user's environment, so an unparseable one must
    // not take the whole program down
    .filter_map(|locale| LanguageIdentifier::try_from_bytes(locale.as_bytes()).ok())
    .collect()
}
/// Get the user's requested locales on macOS and Windows.
///
/// The locales are kept in the order `env_preferences::get_locales()` provides them.
/// Values that cannot be parsed as a `LanguageIdentifier` are skipped instead of
/// causing a panic.
///
/// NOTE(review): assumes the unwrapped value is an iterator over string-like items,
/// as the original `.map(..).collect()` chain implies. Confirm against `env_preferences`.
///
/// # Panics
///
/// Panics if `env_preferences::get_locales()` fails.
#[cfg(any(target_os = "macos", target_os = "windows"))]
pub fn get_locales() -> Vec<LanguageIdentifier> {
    env_preferences::get_locales()
        .unwrap()
        .filter_map(|locale| LanguageIdentifier::try_from_bytes(locale.as_bytes()).ok())
        .collect()
}
]
[[package]]
name = "gettext-rs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e49ea8a8fad198aaa1f9655a2524b64b70eb06b2f3ff37da407566c93054f364"
dependencies = [
"gettext-sys",
"locale_config",
]
[[package]]
name = "gettext-sys"
version = "0.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c63ce2e00f56a206778276704bbe38564c8695249fdc8f354b4ef71c57c3839d"
dependencies = [
"cc",
"temp-dir",
name = "locale_config"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d2c35b16f4483f6c26f0e4e9550717a2f6575bcd6f12a53ff0c490a94a6934"
dependencies = [
"lazy_static",
"objc",
"objc-foundation",
"regex",
"winapi",
]
[[package]]
name = "locale_select"
version = "0.1.0"
dependencies = [
"fluent-langneg",
"gettext-rs",
"icu_locid",
"libc",
]
[[package]]
name = "malloc_buf"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb"
dependencies = [
"libc",
]
[[package]]
name = "objc"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
dependencies = [
"malloc_buf",
]
[[package]]
name = "objc-foundation"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1add1b659e36c9607c7aab864a76c7a4c2760cd0cd2e120f3fb8b952c7e22bf9"
dependencies = [
"block",
"objc",
"objc_id",
]
[[package]]
name = "objc_id"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c92d4ddb4bd7b50d730c215ff871754d0da6b2178849f8a2a2ab69712d0c073b"
dependencies = [
"objc",
]
[[package]]
name = "winapi"
version = "0.3.9"
name = "windows"
version = "0.56.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1de69df01bdf1ead2f4ac895dc77c9351aefff65b2f3db429a343f9cbf05e132"
dependencies = [
"windows-core",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.56.0"