截至2023年8月10日,使用百度云的内容审核只需要完成身份认证即可,且可领取免费的测试资源。

应用场景

  • 用户评论过滤:对网站用户的评论信息进行检测,审核出涉及色情、暴恐、恶意推广等内容,保证良好的用户体验
  • 注册信息筛查:对用户的注册信息进行筛查,避免黑产通过用户名实现违规信息的推广
  • 文章内容审核:对UGC文章内容进行多个维度的审核,避免因内容违规导致的APP下架等损失

领取资源

百度云的内容审核各服务均提供一定额度的免费测试资源供测试使用,免费测试资源使用完毕后,可选择付费使用。

各接口免费测试资源可见免费测试资源文档介绍。

进入领取页面,根据需要选择要领取的接口,然后点击左下角的“0元领取”。

个人认证:

内容审核平台-文本:一次性赠送50,000次,2 QPS,有效期 365天;

内容审核平台-图像:一次性赠送10,000次,2 QPS,有效期 365天。

创建应用列表

调用百度AI服务需要Access_token,获取Access_token则需要通过应用的 API Key和 Secret Key,我们需要创建一个应用

创建完成之后,我们就得到了 API Key 和 Secret Key

获取AccessToken

鉴权认证的主要目的是获取Access_token。Access_token是用户的访问令牌,承载了用户的身份、权限等信息。

使用我提供的工具类获取:

依赖:

<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
AuthUtil
/**
 * Baidu Cloud authentication helper.
 *
 * <p>Exchanges an application's API Key / Secret Key for an access token and caches the token
 * in static fields. The token is refreshed once it is within one day of the expiry reported
 * by the token endpoint.
 *
 * <p>NOTE(review): the cache is not keyed by credentials, so a token obtained for one
 * {@code clientId} is also returned to callers passing a different {@code clientId}.
 */
public class AuthUtil {

    /** Expiry instant of the cached token; {@code null} until a token has been fetched. */
    private static Calendar expireDate;

    /** Cached access token; {@code null} until a token has been fetched successfully. */
    private static String accessToken;

    /** Shared HTTP client; package-visible so that ContentCensor can reuse it. */
    static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();

    /**
     * Decides whether a new token has to be requested.
     *
     * @return {@code true} when no token is cached yet, or when the cached token expires
     *         within the next day
     */
    private static boolean needAuth() {
        if (accessToken == null || expireDate == null) {
            return true;
        }
        Calendar tomorrow = Calendar.getInstance();
        tomorrow.add(Calendar.DATE, 1);
        return tomorrow.after(expireDate);
    }

    /**
     * Returns a valid access token, requesting a new one when necessary.
     *
     * <p>The method is synchronized because the cache is static mutable state that may be
     * reached from several request threads at once.
     *
     * @param clientId     the application's API Key
     * @param clientSecret the application's Secret Key
     * @return the access token
     * @throws Exception if the token request fails or the response carries no token
     */
    public static synchronized String getAccessToken(String clientId, String clientSecret) throws Exception {
        if (needAuth()) {
            JSONObject auth = getAuth(clientId, clientSecret);
            String token = auth.getString("access_token");
            if (token == null || token.isEmpty()) {
                throw new RuntimeException("百度云AccessToken获取失败");
            }
            Calendar expires = Calendar.getInstance();
            expires.add(Calendar.SECOND, auth.getIntValue("expires_in"));
            // Publish the cache only after the whole exchange succeeded, so that a failed
            // attempt is retried on the next call instead of leaving a null token behind.
            accessToken = token;
            expireDate = expires;
        }
        return accessToken;
    }

    /**
     * Calls the token endpoint with the given API Key / Secret Key.
     *
     * @param clientId     the application's API Key
     * @param clientSecret the application's Secret Key
     * @return the parsed JSON response of the token endpoint
     * @throws Exception if the request fails, the HTTP status is not 200, or the body is empty
     */
    private static JSONObject getAuth(String clientId, String clientSecret) throws Exception {
        MediaType mediaType = MediaType.parse("application/json");
        RequestBody body = RequestBody.create(mediaType, "");
        Request request = new Request.Builder()
                .url(String.format(URLConstants.BAIDU_AUTH_TOKEN, clientId, clientSecret))
                .method("POST", body)
                .addHeader("Content-Type", "application/json")
                .addHeader("Accept", "application/json")
                .build();
        // try-with-resources releases the connection even when an exception is thrown.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (response.code() != HttpStatus.OK.value()) {
                throw new RuntimeException("百度云AccessToken获取失败");
            }
            JSONObject auth = JSONObject.parseObject(response.body().string());
            if (auth == null) {
                // parseObject yields null for an empty body
                throw new RuntimeException("百度云AccessToken获取失败");
            }
            return auth;
        }
    }
}

内容审核服务

内容审核平台-文本
接口地址:https://aip.baidubce.com/rest/2.0/solution/v1/text_censor/v2/user_defined
请求方式:POST
接口名称:内容审核平台-文本

内容审核平台-图像
接口地址:https://aip.baidubce.com/rest/2.0/solution/v1/img_censor/v2/user_defined
请求方式:POST
接口名称:内容审核平台-图像

返回参数说明:

参数名称 数据类型 是否必须 备注
log_id Long Y 请求唯一id
error_code Long N 错误提示码,失败才返回,成功不返回
error_msg String N 错误提示信息,失败才返回,成功不返回
conclusion String N 审核结果,可取值:合规、不合规、疑似、审核失败
conclusionType Integer N 审核结果类型,可取值1.合规,2.不合规,3.疑似,4.审核失败

成功响应示例

{
"log_id": 15556561295920002,
"conclusion": "合规",
"conclusionType": 1
}

失败响应示例

{
"log_id": 149319909347709,
"error_code": 0,
"error_msg":"configId error"
}

代码实现

依赖:

<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
ContentCensor
@Getter
@Setter
@Slf4j
@Component
@ConfigurationProperties(prefix = "baiduyun")
public class ContentCensor {

private String clientId;
private String clientSecret;

/**
* 文本审核
*
* @param text 待审核文本
* @return 审核结果 { log_id, conclusion, conclusionType }
* @throws Exception 审核异常
*/
public Map<String, Object> textCensor(String text) throws Exception {

String accessToken = AuthUtil.getAccessToken(clientId, clientSecret);
FormBody formBody = new FormBody.Builder().add("text", text).build();
Request request = new Request.Builder()
.url(String.format(URLConstants.BAIDU_TEXT_CENSOR, accessToken))
.method("POST", formBody)
.addHeader("Content-Type", "application/x-www-form-urlencoded")
.addHeader("Accept", "application/json")
.build();

Response response = AuthUtil.HTTP_CLIENT.newCall(request).execute();
String strBody = response.body().string();
if (response.code() != HttpStatus.OK.value()) {
JSONObject errObj = JSONObject.parseObject(strBody);
log.error("百度文本审核接口调用失败:error_code: {}, error_msg: {}", errObj.getString("error_code"), errObj.getString("error_msg"));
throw new RuntimeException("百度文本审核接口调用失败");
}

JSONObject resultObj = JSONObject.parseObject(strBody);
Map<String, Object> result = new HashMap<>();
result.put("log_id", resultObj.getString("log_id"));
result.put("conclusion", resultObj.getString("conclusion"));
result.put("conclusionType", resultObj.getShortValue("conclusionType"));
return result;
}

/**
* 图像审核
*
* @param imgParams 待审核图像列表 Base64字符串列表
* @return 审核结果 { log_id, conclusion, conclusionType }
* @throws Exception 审核异常
*/
public Map<String, Object> imgCensor(List<String> imgParams) throws Exception {

String accessToken = AuthUtil.getAccessToken(clientId, clientSecret);
Request.Builder builder = new Request.Builder()
.url(String.format(URLConstants.BAIDU_IMG_CENSOR, accessToken))
.addHeader("Content-Type", "application/x-www-form-urlencoded")
.addHeader("Accept", "application/json");

Map<String, Object> result = new HashMap<>();
for (String imgParam : imgParams) {
FormBody formBody = new FormBody.Builder().add("image", imgParam).build();
Request request = builder.method("POST", formBody).build();
Response response = AuthUtil.HTTP_CLIENT.newCall(request).execute();
String strBody = response.body().string();
if (response.code() != HttpStatus.OK.value()) {
JSONObject errObj = JSONObject.parseObject(strBody);
log.error("百度图像审核接口调用失败:error_code: {}, error_msg: {}", errObj.getString("error_code"), errObj.getString("error_msg"));
throw new RuntimeException("百度文本审核接口调用失败");
}
JSONObject resultObj = JSONObject.parseObject(strBody);
Short conclusionType = resultObj.getShort("conclusionType");
// 审核不是通过
if (!Objects.equals(conclusionType, TextCensorResultEnum.COMPLIANCE.type())) {
result.put("log_id", resultObj.getString("log_id"));
result.put("conclusion", resultObj.getString("conclusion"));
result.put("conclusionType", resultObj.getShortValue("conclusionType"));
return result;
}
}
result.put("conclusionType", TextCensorResultEnum.COMPLIANCE.type());
return result;
}
}

常量的定义如下:

URLConstants
/**
 * Endpoint templates of the Baidu Cloud APIs used by this module.
 *
 * <p>Each template contains {@code %s} placeholders meant to be filled with
 * {@link String#format(String, Object...)}.
 */
public class URLConstants {

    /** Scheme and host shared by all endpoints. */
    private static final String HOST = "https://aip.baidubce.com";

    /** Common path prefix of the moderation endpoints. */
    private static final String CENSOR_PATH = HOST + "/rest/2.0/solution/v1";

    /** Token endpoint; placeholders: client id, client secret. */
    public static final String BAIDU_AUTH_TOKEN =
            HOST + "/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s";

    /** Text moderation endpoint; placeholder: access token. */
    public static final String BAIDU_TEXT_CENSOR =
            CENSOR_PATH + "/text_censor/v2/user_defined?access_token=%s";

    /** Image moderation endpoint; placeholder: access token. */
    public static final String BAIDU_IMG_CENSOR =
            CENSOR_PATH + "/img_censor/v2/user_defined?access_token=%s";
}

枚举类的定义如下:

/**
 * Possible conclusions of a moderation request, mirroring the {@code conclusionType} /
 * {@code conclusion} pair of the service response.
 */
public enum TextCensorResultEnum {

    /** Type 1: the content is compliant. */
    COMPLIANCE((short) 1, "合规"),

    /** Type 2: the content is not compliant. */
    NON_COMPLIANCE((short) 2, "不合规"),

    /** Type 3: the content is suspected to be non-compliant. */
    SUSPECTED((short) 3, "疑似"),

    /** Type 4: the review itself failed. */
    AUDIT_FAILED((short) 4, "审核失败");

    /** Numeric conclusion type. */
    private final Short code;

    /** Human-readable conclusion text. */
    private final String label;

    TextCensorResultEnum(short code, String label) {
        this.code = Short.valueOf(code);
        this.label = label;
    }

    /**
     * @return the numeric conclusion type
     */
    public Short type() {
        return this.code;
    }

    /**
     * @return the conclusion text
     */
    public String desc() {
        return this.label;
    }
}

自动装配

如果代码放在其他模块中,请使用该部分实现自动装配

resources目录下创建META-INF/spring.factories,内容如下:

spring.factories
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.swx.common.baiduyun.ContentCensor

项目集成

以上内容可以放在工具模块或者公共模块中,在服务模块中的使用方式如下:

引入依赖(非服务模块):

pom.xml
<dependency>
<groupId>com.swx</groupId>
<artifactId>test-common</artifactId>
</dependency>

配置文件

application.yaml
baiduyun:
client-id: xxxxxxxxxxxxxxxxxxxx
client-secret: xxxxxxxxxxxxxxxxxxxxxxx
  • 填入上述步骤获取的 API Key(client-id) 和 Secret Key(client-secret)

使用文本审核

/**
 * Sends a text to the moderation service and reports whether it passed.
 *
 * <p>The empty branches are placeholders for project-specific handling of each
 * non-compliant outcome. Any exception raised along the way is printed and treated as
 * "not passed".
 *
 * @param text the text to review
 * @return {@code true} only when the service reports the text as compliant
 */
private boolean handleTextScan(String text) {
    try {
        Short conclusionType = (Short) contentCensor.textCensor(text).get("conclusionType");

        if (conclusionType.equals(TextCensorResultEnum.NON_COMPLIANCE.type())) {
            // The text contains violating content
        } else if (conclusionType.equals(TextCensorResultEnum.SUSPECTED.type())) {
            // The text contains content the service is unsure about
        } else if (conclusionType.equals(TextCensorResultEnum.AUDIT_FAILED.type())) {
            // Automatic review failed; hand over to manual review
        }

        return conclusionType.equals(TextCensorResultEnum.COMPLIANCE.type());
    } catch (Exception ex) {
        ex.printStackTrace();
        return false;
    }
}

使用图像审核

接口支持以下两种格式的图像数据:

  • Base64字符串,需要编码之后的(本文使用)
  • 图片URL地址
/**
 * Downloads the given images, submits them to the moderation service as Base64 strings and
 * reports whether all of them passed.
 *
 * <p>The empty branches are placeholders for project-specific handling of each
 * non-compliant outcome. Any exception raised by the moderation call is printed and treated
 * as "not passed".
 *
 * @param images image URLs
 * @return {@code true} only when the service reports the images as compliant
 */
private boolean handleImageScan(List<String> images) {
    List<String> imgParams = new ArrayList<>();
    // Drop duplicate URLs so that each image is downloaded and reviewed once
    images = images.stream().distinct().collect(Collectors.toList());
    for (String image : images) {
        // Download the image behind the URL as raw bytes
        byte[] imgData = downLoadFile(image);
        // The service expects Base64. The string is passed on un-escaped because imgCensor
        // submits it through FormBody.Builder.add, which already form-encodes the value;
        // URL-encoding it here as well would double-encode '+', '/' and '=' and corrupt
        // the payload.
        imgParams.add(Base64Util.encode(imgData));
    }

    try {
        Map<String, Object> map = contentCensor.imgCensor(imgParams);
        Short type = (Short) map.get("conclusionType");

        if (type.equals(TextCensorResultEnum.NON_COMPLIANCE.type())) {
            // Violating content found
        }
        if (type.equals(TextCensorResultEnum.SUSPECTED.type())) {
            // An image is suspected to contain violating content
        }
        if (type.equals(TextCensorResultEnum.AUDIT_FAILED.type())) {
            // Automatic review failed; hand over to manual review
        }
        return type.equals(TextCensorResultEnum.COMPLIANCE.type());
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
}

Base64转换工具

Base64Util
/**
 * Base64 helper.
 *
 * <p>Encodes with the standard Base64 alphabet ({@code A-Z a-z 0-9 + /}) and {@code =}
 * padding, without line breaks.
 */
public class Base64Util {

    /** Kept public for compatibility with existing callers; the class only has static methods. */
    public Base64Util() {
    }

    /**
     * Encodes the given bytes as a Base64 string.
     *
     * <p>Delegates to the JDK encoder instead of a hand-written bit-shifting loop; the
     * output is the same standard, padded Base64 text.
     *
     * @param from the bytes to encode; must not be {@code null}
     * @return the Base64 representation, empty for an empty array
     * @throws NullPointerException if {@code from} is {@code null}
     */
    public static String encode(byte[] from) {
        // Fully qualified so that the file needs no additional import.
        return java.util.Base64.getEncoder().encodeToString(from);
    }
}

自维护敏感词汇

可以使用DFA算法,即使用确定有穷自动机(一种数据结构)来匹配敏感词。