// Load the page at `address` and collect the nodes matched by a TableTag class filter.
Parser parser = new Parser();
parser.setURL(address);
NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
NodeList list = parser.extractAllNodesThatMatch(tableFilter);

// Re-decode the extracted HTML: take its bytes back out as ISO-8859-1 and
// interpret them as GB2312. Per the author's note, the text otherwise shows up
// garbled when printed.
String test = new String(list.toHtml().getBytes("iso8859-1"), "gb2312");
System.out.println(test);
这是我写的一段抓网页的程序. 最初抓回来的数据在ie中显示正常, 而在firefox中是乱码, 在eclipse控制台里打印出来也是乱码. 后来把字符串先按iso8859-1取出字节, 再以gb2312重新解码, 一切正常.
牢记: 以get和post方式发送请求时, 默认编码就是iso8859-1. 只不过ie隐式地替我们做了转换而已.
iso8859-1 我称为结果编码. gb2312称为显示编码. 怎么找这两个编码?
1>get和post默认都是iso8859-1, 所以结果编码一般为iso8859-1. 但如果是ajax请求, 则一般为utf-8. 所以可以在两者之间试试.
2>显示编码在该网页就能抄到.要不网页也不能正确显示中文.
也可以加上:
parser.setEncoding("gb2312");省了手动转码的工夫.
// Merge multiple rows into one record.
/**
 * Builds the list of system-owner records, collapsing consecutive actors that
 * share the same instance id, scope id and scope label into a single map.
 *
 * <p>Each map in the returned list holds the keys {@code IAP_ID},
 * {@code IAP_LIBELLE}, {@code SCOPE_ID} and {@code SCOPE_LABEL}; the actor ids of
 * the merged rows are accumulated through {@code concateString} under
 * {@code INSTANCE_IDS} (presumably joined with "," -- confirm against that helper).
 *
 * <p>Only adjacent rows are merged, so this assumes the list returned by
 * {@code Manager.GetInstanceSystemOwner} is already ordered by the grouping key
 * -- TODO confirm with the data source.
 *
 * @param request the current HTTP request; not read by this method
 * @return the merged records; if an exception occurs it is logged and the records
 *         completed so far are returned (the record in progress is dropped)
 */
public List generateDataList(HttpServletRequest request) {
    List dataList = new ArrayList();
    try {
        List systemOwners = (List) Manager.GetInstanceSystemOwner(null);
        List entityList = getEntityList();
        List regionList = getRegionList();

        Map result = null;
        String lastKey = null;
        String sep = ",";

        for (Iterator it = systemOwners.iterator(); it.hasNext();) {
            RoleActeur actor = (RoleActeur) it.next();
            String instanceId = actor.getEntiteOrganisationnelle().toString();
            String instanceLabel = actor.getInstanceName();
            String scopeId = actor.getScopeId().toString();
            String scope = actor.getScope();
            String scopeLabel = getEntityOrRegionName(scope, scopeId, entityList, regionList);

            String key = instanceId + "_" + scopeId + "_" + scopeLabel;
            if (!key.equals(lastKey)) {
                // Key changed: flush the record being built and start a new one.
                if (result != null) {
                    dataList.add(result);
                }
                result = new HashMap();
                result.put("IAP_ID", instanceId);
                result.put("IAP_LIBELLE", instanceLabel);
                result.put("SCOPE_ID", scopeId);
                result.put("SCOPE_LABEL", scopeLabel);
                lastKey = key;
            }
            concateString(result, "INSTANCE_IDS", actor.getIdActeur(), sep);
        }

        // Add the last record, which the loop never flushes.
        if (result != null) {
            dataList.add(result);
        }
    } catch (Exception e) {
        // Pass the exception as the throwable argument so the stack trace is logged.
        logger.error("Failed to generate the system owner data list", e);
    }
    return dataList;
}
曾经想过写个接口. 后来想想很难包含其他业务. 干脆先把代码记录下来.