https://www.jianshu.com/p/afee9acba686
数据文件为文本文件,每一行为固定格式,每一列的长度要么固定,要么有范围限制,因此考虑采用hive提供的RegexSerDe来实现记录解析,使用后发现hive查询出的数据中,中文字段出现乱码
Hadoop中文件默认utf8编码,hive序列化操作时,默认按照utf8来解析,而数据文件是GBK编码,所以肯定会乱码,从网上查了下,解决方案是建表时指定serde的"serialization.encoding"="GBK",然而并没有解决我的问题
Hive建表格式为ROW FORMAT,不指定SerDe时,默认用的是org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe,继承了org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe,而该类确实可以通过设置"serialization.encoding"="GBK"来解决hive读取gbk文件乱码的问题,代码如下:
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by Fernflower decompiler)
//
package org.apache.hadoop.hive.serde2;
import com.google.common.base.Charsets;
import java.nio.charset.Charset;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Abstract SerDe that resolves a character set from the table properties and,
 * whenever that character set is not UTF-8, passes each row through a
 * subclass-supplied conversion hook around the real (de)serialization step.
 *
 * <p>Subclasses provide the four abstract hooks; {@link #serialize} and
 * {@link #deserialize} are final and only orchestrate them.
 */
public abstract class AbstractEncodingAwareSerDe extends AbstractSerDe {
    private static final Logger LOG = LoggerFactory.getLogger(AbstractEncodingAwareSerDe.class);

    /**
     * Character set selected in {@link #initialize}. This class assigns it
     * nowhere else, so it is {@code null} until that method has run.
     */
    protected Charset charset;

    public AbstractEncodingAwareSerDe() {
    }

    /**
     * Reads the {@code "serialization.encoding"} table property (falling back
     * to {@code "UTF-8"} when it is absent) and stores the matching charset.
     * A warning is logged when the result is ISO-8859-1 or US-ASCII.
     *
     * @param conf not read by this implementation
     * @param tbl  table properties holding the optional encoding name
     * @throws SerDeException declared by the signature; not thrown directly here
     * @deprecated flagged as deprecated in the original class; the replacement
     *             is not visible in this file
     */
    @Deprecated
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {
        String encodingName = tbl.getProperty("serialization.encoding", "UTF-8");
        this.charset = Charset.forName(encodingName);

        boolean isLatin1 = this.charset.equals(Charsets.ISO_8859_1);
        if (isLatin1 || this.charset.equals(Charsets.US_ASCII)) {
            LOG.warn("The data may not be properly converted to target charset " + this.charset);
        }
    }

    /**
     * Serializes via {@link #doSerialize} and, unless the configured charset
     * is UTF-8, hands the result to {@link #transformFromUTF8}.
     *
     * @param obj          the row object to serialize
     * @param objInspector inspector describing {@code obj}
     * @return the serialized row, converted when the charset is not UTF-8
     * @throws SerDeException if {@link #doSerialize} fails
     */
    public final Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
        // Serialize first; the charset is only consulted afterwards.
        Writable serialized = this.doSerialize(obj, objInspector);
        return this.charset.equals(Charsets.UTF_8) ? serialized : this.transformFromUTF8(serialized);
    }

    /**
     * Deserializes {@code blob} via {@link #doDeserialize}, first passing it
     * through {@link #transformToUTF8} unless the configured charset is UTF-8.
     *
     * @param blob the raw record
     * @return whatever {@link #doDeserialize} produces
     * @throws SerDeException if {@link #doDeserialize} fails
     */
    public final Object deserialize(Writable blob) throws SerDeException {
        Writable input = this.charset.equals(Charsets.UTF_8) ? blob : this.transformToUTF8(blob);
        return this.doDeserialize(input);
    }

    /**
     * Conversion hook applied to serialized output when the charset is not
     * UTF-8; presumably re-encodes UTF-8 bytes into {@link #charset}.
     */
    protected abstract Writable transformFromUTF8(Writable blob);

    /**
     * Conversion hook applied to incoming records when the charset is not
     * UTF-8; presumably re-encodes bytes from {@link #charset} into UTF-8.
     */
    protected abstract Writable transformToUTF8(Writable blob);

    /** Performs the actual serialization; invoked by {@link #serialize}. */
    protected abstract Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException;

    /** Performs the actual deserialization; invoked by {@link #deserialize}. */
    protected abstract Object doDeserialize(Writable blob) throws SerDeException;
}