/**
 * Tells whether a character belongs to the Unicode "control" general
 * category.
 *
 * @param c the character to test
 * @return {@code true} if {@link Character#getType(char)} reports
 *         {@link Character#CONTROL} for {@code c}; {@code false} otherwise
 */
public static boolean isControl(char c) {
    final int category = Character.getType(c);
    return Character.CONTROL == category;
}
}
/**
 * Tells whether a character falls into one of the Unicode symbol
 * categories: math, currency, modifier or other symbol.
 *
 * @param c the character to test
 * @return {@code true} if the general category of {@code c} is one of the
 *         four symbol categories; {@code false} otherwise
 */
public static boolean isSymbol(char c) {
    switch (Character.getType(c)) {
        case Character.MATH_SYMBOL:
        case Character.CURRENCY_SYMBOL:
        case Character.MODIFIER_SYMBOL:
        case Character.OTHER_SYMBOL:
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether a character falls into one of the Unicode punctuation
 * categories: start, end, other, connector, dash, initial-quote or
 * final-quote punctuation.
 *
 * @param c the character to test
 * @return {@code true} if the general category of {@code c} is one of the
 *         seven punctuation categories; {@code false} otherwise
 */
public static boolean isPunctuation(char c) {
    switch (Character.getType(c)) {
        case Character.START_PUNCTUATION:
        case Character.END_PUNCTUATION:
        case Character.OTHER_PUNCTUATION:
        case Character.CONNECTOR_PUNCTUATION:
        case Character.DASH_PUNCTUATION:
        case Character.INITIAL_QUOTE_PUNCTUATION:
        case Character.FINAL_QUOTE_PUNCTUATION:
            return true;
        default:
            return false;
    }
}
/**
 * Returns the general Unicode category of a {@code char} by widening it to
 * a code point and delegating to the {@code int} overload of
 * {@code getType}.
 *
 * @param c
 *            the character whose category is requested.
 * @return the Unicode category of {@code c}.
 */
public static int getType(char c) {
    final int codePoint = c;
    return getType(codePoint);
}
/**
 * Tells whether a character is considered printable, judged purely by its
 * Unicode general category.
 *
 * <p>A character is rejected when its category is unassigned, line
 * separator, paragraph separator, control, format, private use or
 * surrogate. Every other category, including space separators, is accepted.
 *
 * @param c the character to test
 * @return {@code false} for the excluded categories listed above;
 *         {@code true} otherwise
 */
public static boolean isPrintableUnicode(char c) {
    switch (Character.getType(c)) {
        case Character.UNASSIGNED:
        case Character.LINE_SEPARATOR:
        case Character.PARAGRAPH_SEPARATOR:
        case Character.CONTROL:
        case Character.FORMAT:
        case Character.PRIVATE_USE:
        case Character.SURROGATE:
            return false;
        default:
            return true;
    }
}
/**
 * Tests whether a code point belongs to this class, i.e. whether its
 * Unicode general category equals {@code category}. The result is inverted
 * when {@code alt} is {@code true}.
 *
 * @param ch the code point to test
 * @return {@code alt} XOR "the category of {@code ch} equals
 *         {@code category}"
 */
@Override
public boolean contains(int ch) {
    // Use the int overload: narrowing to char would drop the high bits of a
    // supplementary code point and classify an unrelated BMP character.
    return alt ^ (category == Character.getType(ch));
}
}
/**
 * Tests whether a code point belongs to this class. {@code category} is
 * used as a bit set indexed by Unicode general category: the code point
 * matches when the bit at position {@code Character.getType(ch)} is set.
 * The result is inverted when {@code alt} is {@code true}.
 *
 * @param ch the code point to test
 * @return {@code alt} XOR "the bit for the category of {@code ch} is set
 *         in {@code category}"
 */
@Override
public boolean contains(int ch) {
    // Use the int overload: narrowing to char would drop the high bits of a
    // supplementary code point and classify an unrelated BMP character.
    final boolean categoryBitSet = ((category >> Character.getType(ch)) & 1) != 0;
    return alt ^ categoryBitSet;
}
}
public static boolean isInvisibleChar(int c) { return Character.getType(c) == 16 && (c == 8203 || c == 8204 || c == 8205 || c == 173); // zero width sp, zw non join, zw join, soft hyphen }
/**
 * Maps a character to one of five word-break classes based on its Unicode
 * general category: upper-case letter, lower-case letter, other letter
 * (title-case, modifier or other letter), decimal digit, or everything
 * else.
 *
 * @param c0 the character to classify
 * @return {@code UPPER_LETTER}, {@code LOWER_LETTER}, {@code OTHER_LETTER},
 *         {@code DIGIT} or {@code OTHER}
 */
protected int classify(char c0) {
    final int category = Character.getType(c0);
    if (category == Character.UPPERCASE_LETTER) {
        return UPPER_LETTER;
    }
    if (category == Character.LOWERCASE_LETTER) {
        return LOWER_LETTER;
    }
    if (category == Character.TITLECASE_LETTER
            || category == Character.MODIFIER_LETTER
            || category == Character.OTHER_LETTER) {
        return OTHER_LETTER;
    }
    if (category == Character.DECIMAL_DIGIT_NUMBER) {
        return DIGIT;
    }
    return OTHER;
}
public static boolean isLetterOrDigitOrPunc(char ch) { int i = Character.getType(ch); return Character.isLowerCase(ch) || Character.isUpperCase(ch) || Character.isDigit(ch) || (i>=20&&i<=30); //TODO:可以修改为Java7中的isLetterOrDigit }
// Prints whether '१' (DEVANAGARI DIGIT ONE) is reported as a decimal digit
// by Character.getType.
System.out.println(Character.getType('१') == Character.DECIMAL_DIGIT_NUMBER);
/**
 * Rejects code points whose Unicode general category is
 * {@link Character#UNASSIGNED}.
 *
 * @param input the code point to check
 * @throws RuntimeException whatever {@code log.unassignedCodePoint(input)}
 *         produces, when {@code input} is unassigned
 */
public static void forbidUnassigned(int input) {
    final boolean unassigned = Character.UNASSIGNED == Character.getType(input);
    if (unassigned) {
        throw log.unassignedCodePoint(input);
    }
}
/**
 * Draws code points via {@code intBetween} until one is found that can be
 * used when composing a string, i.e. one that is neither unassigned, nor
 * private use, nor a surrogate.
 *
 * @param maxCodePoint the upper bound passed to {@code intBetween}; the
 *                     lower bound is {@link Character#MIN_CODE_POINT}
 * @return a pseudorandom valid code point
 */
private int nextValidCodePoint( int maxCodePoint )
{
    while ( true )
    {
        final int candidate = intBetween( Character.MIN_CODE_POINT, maxCodePoint );
        switch ( Character.getType( candidate ) )
        {
        case Character.UNASSIGNED:
        case Character.PRIVATE_USE:
        case Character.SURROGATE:
            // Not usable in a string; draw again.
            continue;
        default:
            return candidate;
        }
    }
}
/**
 * Returns the Unicode directionality of a code point.
 *
 * <p>Unassigned code points yield
 * {@link Character#DIRECTIONALITY_UNDEFINED}. Otherwise the value from
 * {@code getDirectionalityImpl} is used as an index into the
 * {@code DIRECTIONALITY} table, except that {@code -1} is passed through
 * unchanged.
 *
 * @param codePoint
 *            the Unicode code point to get the directionality of.
 * @return the Unicode directionality of {@code codePoint}.
 */
public static byte getDirectionality(int codePoint) {
    if (getType(codePoint) != Character.UNASSIGNED) {
        final byte tableIndex = getDirectionalityImpl(codePoint);
        if (tableIndex != -1) {
            return DIRECTIONALITY[tableIndex];
        }
        return -1;
    }
    return Character.DIRECTIONALITY_UNDEFINED;
}
private static boolean isSbeCSharpIdentifierPart(final char c) { if (isSbeCSharpIdentifierStart(c)) { return true; } switch (Character.getType(c)) { case Character.NON_SPACING_MARK: // Mn case Character.COMBINING_SPACING_MARK: // Mc case Character.DECIMAL_DIGIT_NUMBER: // Nd case Character.CONNECTOR_PUNCTUATION: // Pc case Character.FORMAT: // Cf return true; default: return false; } } }
/**
 * Builds a string of {@code length} random characters.
 *
 * <p>Each character is drawn as
 * {@code MIN_RANDOM_CODEPOINT + r.nextInt(RANGE_RANDOM_CODEPOINT)} and
 * redrawn until the value is a defined, non-private-use code point whose
 * {@code char} narrowing is not a surrogate.
 *
 * @param r      source of randomness
 * @param length number of characters to generate
 * @return the generated string
 */
public static String getRandUnicodeString(Random r, int length) {
    final StringBuilder out = new StringBuilder();
    for (int produced = 0; produced < length; produced++) {
        char candidate;
        boolean accepted;
        do {
            final int codePoint = MIN_RANDOM_CODEPOINT + r.nextInt(RANGE_RANDOM_CODEPOINT);
            // NOTE(review): the narrowing assumes the drawn value fits in a char;
            // confirm against the two range constants.
            candidate = (char) codePoint;
            accepted = Character.isDefined(codePoint)
                    && Character.getType(codePoint) != Character.PRIVATE_USE
                    && !Character.isSurrogate(candidate);
        } while (!accepted);
        out.append(candidate);
    }
    return out.toString();
}
/**
 * Decodes a UTF-8 encoded binary value, substituting the given code point
 * for invalid sequences via {@code SliceUtf8.fixInvalidUtf8}.
 *
 * @param slice                the UTF-8 encoded bytes
 * @param replacementCodePoint the code point to substitute; must lie within
 *                             the valid code point range and must not be a
 *                             surrogate
 * @return the decoded string
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} if
 *         {@code replacementCodePoint} is negative, exceeds
 *         {@code MAX_CODE_POINT}, or is a surrogate
 */
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType(StandardTypes.BIGINT) long replacementCodePoint)
{
    // Range-check the long BEFORE narrowing it: a negative value would otherwise
    // be truncated to its low 32 bits and could pass as an unrelated code point.
    if (replacementCodePoint < Character.MIN_CODE_POINT
            || replacementCodePoint > MAX_CODE_POINT
            || Character.getType((int) replacementCodePoint) == SURROGATE) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character");
    }
    return SliceUtf8.fixInvalidUtf8(slice, OptionalInt.of((int) replacementCodePoint));
}
private synchronized void processRawFrame(OutputFrame outputFrame, Consumer<OutputFrame> consumer) { String utf8String = outputFrame.getUtf8String(); byte[] bytes = outputFrame.getBytes(); // Merging the strings by bytes to solve the problem breaking non-latin unicode symbols. if (brokenFrame != null) { bytes = merge(brokenFrame.getBytes(), bytes); utf8String = new String(bytes); brokenFrame = null; } // Logger chunks can break the string in middle of multibyte unicode character. // Backup the bytes to reconstruct proper char sequence with bytes from next frame. int lastCharacterType = Character.getType(utf8String.charAt(utf8String.length() - 1)); if (lastCharacterType == Character.OTHER_SYMBOL) { brokenFrame = new OutputFrame(outputFrame.getType(), bytes); return; } utf8String = processAnsiColorCodes(utf8String, consumer); normalizeLogLines(utf8String, consumer); }