11.3.3 commitlog文件的命名和管理
commitlog文件是延迟创建的,也就是说,用到的时候才创建,所以CommitLog启动时并不会创建文件。既然创建发生在使用时,那么结合上两节消息存储处理逻辑中的 putMessage/putMessages 两个方法,可以确定创建的入口方法是 MappedFileQueue.getLastMappedFile,源码如下:
// 这个方法单单是获取commitlog文件的映射,有则返回,无则返回null
/**
 * Returns the last entry of {@code mappedFiles} without creating anything.
 *
 * <p>An {@link IndexOutOfBoundsException} raised while reading the last index is treated as a
 * signal to try again for as long as the list is non-empty (presumably the list can shrink
 * between the {@code size()} and {@code get()} calls; confirm against the list's writers).
 *
 * @return the last mapped file, or {@code null} if the list is empty or an unexpected
 *         exception was logged
 */
public MappedFile getLastMappedFile() {
    for (;;) {
        if (this.mappedFiles.isEmpty()) {
            return null;
        }
        try {
            final int tailIndex = this.mappedFiles.size() - 1;
            return this.mappedFiles.get(tailIndex);
        } catch (IndexOutOfBoundsException retry) {
            // Fall through and re-check the list on the next iteration.
        } catch (Exception e) {
            log.error("getLastMappedFile has exception.", e);
            return null;
        }
    }
}
// 这个方法既有获取又有创建,对commitlog文件的创建,从这开始
/**
 * Convenience overload of {@link #getLastMappedFile(long, boolean)} that always permits
 * creation of a new file.
 *
 * @param startOffset offset forwarded unchanged to the two-argument overload
 * @return whatever the two-argument overload returns for {@code needCreate == true}
 */
public MappedFile getLastMappedFile(final long startOffset) {
    final boolean createWhenNeeded = true;
    return this.getLastMappedFile(startOffset, createWhenNeeded);
}
// 创建的具体实现在这个方法中
/**
 * Returns the last mapped file, creating the next one when the queue has no file yet or the
 * last file is full.
 *
 * @param startOffset offset used to derive the start of the first file when the queue is
 *                    empty; it is rounded down to a multiple of {@code mappedFileSize}
 * @param needCreate  whether a new file may be created
 * @return the newly created file when creation was attempted ({@code null} if creation
 *         failed); otherwise the current last file, which may itself be {@code null}
 */
public MappedFile getLastMappedFile(final long startOffset, boolean needCreate) {
    final MappedFile tail = getLastMappedFile();

    // Start offset of the file to create; -1 means no new file is required.
    final long createOffset;
    if (tail == null) {
        // No file yet: align the requested offset down to a file boundary.
        createOffset = startOffset - (startOffset % this.mappedFileSize);
    } else if (tail.isFull()) {
        // Last file is full: the next file starts right after it.
        createOffset = tail.getFileFromOffset() + this.mappedFileSize;
    } else {
        createOffset = -1;
    }

    if (createOffset == -1 || !needCreate) {
        return tail;
    }

    // File names are the start offsets rendered by UtilAll.offset2FileName.
    final String nextFilePath =
        this.storePath + File.separator + UtilAll.offset2FileName(createOffset);
    // Path of the file after that one; only used by the allocation service path below.
    final String nextNextFilePath =
        this.storePath + File.separator + UtilAll.offset2FileName(createOffset + this.mappedFileSize);

    MappedFile created = null;
    if (this.allocateMappedFileService == null) {
        // No allocation service configured: create the single file on the calling thread.
        try {
            created = new MappedFile(nextFilePath, this.mappedFileSize);
        } catch (IOException e) {
            log.error("create mappedFile exception", e);
        }
    } else {
        // Hand both paths to the allocation service and wait for the first file.
        created = this.allocateMappedFileService.putRequestAndReturnMappedFile(
            nextFilePath, nextNextFilePath, this.mappedFileSize);
    }

    if (created == null) {
        return null;
    }

    // Flag the file as the first one created when the queue was empty before this add.
    if (this.mappedFiles.isEmpty()) {
        created.setFirstCreateInQueue(true);
    }
    this.mappedFiles.add(created);
    return created;
}
allocateMappedFileService.putRequestAndReturnMappedFile
/**
 * Registers allocation requests for {@code nextFilePath} and {@code nextNextFilePath}, then
 * blocks until the request for {@code nextFilePath} has been fulfilled.
 *
 * <p>The wait is bounded by {@code waitTimeOut} milliseconds. On timeout the request is left
 * in {@code requestTable}; it is removed only after a successful wait.
 *
 * @param nextFilePath     path of the file the caller needs now
 * @param nextNextFilePath path of the following file, queued as a pre-allocation
 * @param fileSize         size passed to both allocation requests
 * @return the mapped file for {@code nextFilePath}, or {@code null} when the submission budget
 *         is exhausted, the service has recorded an exception, the wait timed out or was
 *         interrupted, or no request is found in {@code requestTable}
 */
public MappedFile putRequestAndReturnMappedFile(String nextFilePath, String nextNextFilePath, int fileSize) {
    // Number of requests this call may still enqueue; two unless the pool check below applies.
    int quota = 2;
    if (this.messageStore.getMessageStoreConfig().isTransientStorePoolEnable()
        && this.messageStore.getMessageStoreConfig().isFastFailIfNoBufferInStorePool()
        && BrokerRole.SLAVE != this.messageStore.getMessageStoreConfig().getBrokerRole()) {
        // if broker is slave, don't fast fail even no buffer in pool
        quota = this.messageStore.getTransientStorePool().remainBufferNumbs() - this.requestQueue.size();
    }

    // Request for the file that is needed right now.
    final AllocateRequest primary = new AllocateRequest(nextFilePath, fileSize);
    if (this.requestTable.putIfAbsent(nextFilePath, primary) == null) {
        if (quota <= 0) {
            log.warn("[NOTIFYME]TransientStorePool is not enough, so create mapped file error, " +
                "RequestQueueSize : {}, StorePoolSize: {}", this.requestQueue.size(), this.messageStore.getTransientStorePool().remainBufferNumbs());
            this.requestTable.remove(nextFilePath);
            return null;
        }
        // Queue it for the service thread, which performs the allocation asynchronously.
        if (!this.requestQueue.offer(primary)) {
            log.warn("never expected here, add a request to preallocate queue failed");
        }
        quota--;
    }

    // Request for the file after that one (pre-allocation).
    final AllocateRequest lookahead = new AllocateRequest(nextNextFilePath, fileSize);
    if (this.requestTable.putIfAbsent(nextNextFilePath, lookahead) == null) {
        if (quota > 0) {
            if (!this.requestQueue.offer(lookahead)) {
                log.warn("never expected here, add a request to preallocate queue failed");
            }
        } else {
            log.warn("[NOTIFYME]TransientStorePool is not enough, so skip preallocate mapped file, " +
                "RequestQueueSize : {}, StorePoolSize: {}", this.requestQueue.size(), this.messageStore.getTransientStorePool().remainBufferNumbs());
            this.requestTable.remove(nextNextFilePath);
        }
    }

    if (hasException) {
        log.warn(this.getServiceName() + " service has exception. so return null");
        return null;
    }

    // Whichever request is registered for nextFilePath is the one to wait on.
    final AllocateRequest pending = this.requestTable.get(nextFilePath);
    if (pending == null) {
        log.error("find preallocate mmap failed, this never happen");
        return null;
    }

    try {
        final boolean finishedInTime = pending.getCountDownLatch().await(waitTimeOut, TimeUnit.MILLISECONDS);
        if (finishedInTime) {
            // Fulfilled: drop the bookkeeping entry and hand back the mapping.
            this.requestTable.remove(nextFilePath);
            return pending.getMappedFile();
        }
        log.warn("create mmap timeout " + pending.getFilePath() + " " + pending.getFileSize());
        return null;
    } catch (InterruptedException e) {
        log.warn(this.getServiceName() + " service has exception. ", e);
        return null;
    }
}
11.3.3.1 同步创建文件及映射
以下的源码都在类 MappedFile 中。
/**
 * Creates a mapped file by delegating to {@code init(fileName, fileSize)}.
 *
 * @param fileName path of the backing file
 * @param fileSize number of bytes to map
 * @throws IOException if {@code init} fails to open or map the file
 */
public MappedFile(final String fileName, final int fileSize) throws IOException {
    this.init(fileName, fileSize);
}
/**
 * Opens {@code fileName} read-write and maps its first {@code fileSize} bytes into memory.
 *
 * <p>The file's base name is parsed as a {@code long} and stored as {@code fileFromOffset}.
 * On success the global mapped-memory and mapped-file counters are increased. If opening or
 * mapping fails, the channel (when one was opened) is closed before the exception propagates.
 *
 * @param fileName path of the backing file; its last path element must parse as a long
 * @param fileSize number of bytes to map
 * @throws IOException if the file cannot be opened or mapped
 */
private void init(final String fileName, final int fileSize) throws IOException {
    this.fileName = fileName;
    this.fileSize = fileSize;
    this.file = new File(fileName);
    // The file name doubles as the start offset of this file.
    this.fileFromOffset = Long.parseLong(this.file.getName());

    boolean mapped = false;
    // Make sure the parent directory exists before opening the file.
    ensureDirOK(this.file.getParent());

    try {
        final RandomAccessFile backing = new RandomAccessFile(this.file, "rw");
        this.fileChannel = backing.getChannel();
        // Memory-map the region [0, fileSize) for reading and writing.
        this.mappedByteBuffer = this.fileChannel.map(MapMode.READ_WRITE, 0, fileSize);
        // Account for the new mapping in the global statistics.
        TOTAL_MAPPED_VIRTUAL_MEMORY.addAndGet(fileSize);
        TOTAL_MAPPED_FILES.incrementAndGet();
        mapped = true;
    } catch (FileNotFoundException e) {
        log.error("create file channel " + this.fileName + " Failed. ", e);
        throw e;
    } catch (IOException e) {
        log.error("map file " + this.fileName + " Failed. ", e);
        throw e;
    } finally {
        // Release the channel if anything went wrong after it was opened.
        if (!mapped && this.fileChannel != null) {
            this.fileChannel.close();
        }
    }
}
11.3.3.2 异步创建文件及映射
以下源码都在类 AllocateMappedFileService 中,该类继承 ServiceThread,而 ServiceThread实现 Runnable
/**
 * Service loop: keeps calling {@code mmapOperation()} until the service is stopped or the
 * operation returns {@code false}.
 */
public void run() {
    log.info(this.getServiceName() + " service started");

    while (!this.isStopped()) {
        final boolean keepRunning = this.mmapOperation();
        if (!keepRunning) {
            break;
        }
    }

    log.info(this.getServiceName() + " service end");
}
/**
 * Takes one allocation request from {@code requestQueue} and, if it is still the request
 * registered in {@code requestTable} and has no mapped file yet, creates the mapped file.
 *
 * <p>The request's latch is counted down only when a mapped file was created and stored on
 * the request in this call. Requests that are stale or mismatched are logged and skipped.
 * On {@link IOException} the request is put back on the queue for another attempt.
 *
 * @return {@code false} only when the blocking {@code take()} is interrupted; {@code true} in
 *         every other case, including the stale-request and I/O failure paths
 */
private boolean mmapOperation() {
    boolean isSuccess = false;
    AllocateRequest req = null;
    try {
        // Block until a request is available.
        req = this.requestQueue.take();
        // The same request object must still be registered under its file path.
        AllocateRequest expectedRequest = this.requestTable.get(req.getFilePath());
        if (null == expectedRequest) { // no longer registered
            log.warn("this mmap request expired, maybe cause timeout " + req.getFilePath() + " "
                + req.getFileSize());
            return true;
        }
        if (expectedRequest != req) { // a different request object is registered
            log.warn("never expected here, maybe cause timeout " + req.getFilePath() + " "
                + req.getFileSize() + ", req:" + req + ", expectedRequest:" + expectedRequest);
            return true;
        }
        // Only create the file when the request does not carry one yet.
        if (req.getMappedFile() == null) {
            // Start of the timing window for the creation below.
            long beginTime = System.currentTimeMillis();
            MappedFile mappedFile;
            // Transient store pool enabled: initialise the file with the pool.
            if (messageStore.getMessageStoreConfig().isTransientStorePoolEnable()) {
                try {
                    // Try a MappedFile implementation provided through ServiceLoader first.
                    mappedFile = ServiceLoader.load(MappedFile.class).iterator().next();
                    mappedFile.init(req.getFilePath(), req.getFileSize(), messageStore.getTransientStorePool());
                } catch (RuntimeException e) {
                    // Fall back to constructing MappedFile directly.
                    log.warn("Use default implementation.");
                    mappedFile = new MappedFile(req.getFilePath(), req.getFileSize(), messageStore.getTransientStorePool());
                }
            } else {
                // Plain creation; see section 11.3.3.1 of the surrounding article.
                mappedFile = new MappedFile(req.getFilePath(), req.getFileSize());
            }
            // Elapsed creation time in milliseconds.
            long eclipseTime = UtilAll.computeEclipseTimeMilliseconds(beginTime);
            // Creation slower than 10 ms is logged as a warning.
            if (eclipseTime > 10) {
                int queueSize = this.requestQueue.size();
                log.warn("create mappedFile spent time(ms) " + eclipseTime + " queue size " + queueSize
                    + " " + req.getFilePath() + " " + req.getFileSize());
            }
            // Warm the file up only when it is at least commit-log sized and warm-up is enabled
            // in the store config; see warmMappedFile for what the warm-up does.
            if (mappedFile.getFileSize() >= this.messageStore.getMessageStoreConfig()
                .getMapedFileSizeCommitLog()
                &&
                this.messageStore.getMessageStoreConfig().isWarmMapedFileEnable()) {
                mappedFile.warmMappedFile(this.messageStore.getMessageStoreConfig().getFlushDiskType(),
                    this.messageStore.getMessageStoreConfig().getFlushLeastPagesWhenWarmMapedFile());
            }
            // Publish the result on the request and clear the error flag.
            req.setMappedFile(mappedFile);
            this.hasException = false;
            isSuccess = true;
        }
    } catch (InterruptedException e) {
        log.warn(this.getServiceName() + " interrupted, possibly by shutdown.");
        this.hasException = true; // record the failure; returning false ends the caller's loop
        return false;
    } catch (IOException e) {
        log.warn(this.getServiceName() + " service has exception. ", e);
        this.hasException = true;
        if (null != req) {
            // Re-queue the request so a later iteration tries it again.
            requestQueue.offer(req);
            try {
                Thread.sleep(1); // brief pause before the retry
            } catch (InterruptedException ignored) {
            }
        }
    } finally { // runs on every exit path, including the early returns above
        if (req != null && isSuccess)
            // Release the thread waiting on this request's latch.
            req.getCountDownLatch().countDown();
    }
    return true; // keep the service loop running
}
MappedFile.warmMappedFile
为什么需要预写0值进去呢,我的猜想是,commitlog文件命名是long的值,每创建一个文件就增加1G的值,总有一天会达到long的最大值,一旦达到最大值再加1G,文件名就是负数了,这样就会存在循环的问题,为了不让原文件的内容干扰,这就需要先将文件内容置0,仅限于我的猜想。不过正如上文介绍预热时所说,逐页写入的直接作用是让映射的内存页提前进入物理内存。
/**
 * Warms the mapping by writing a zero byte at the start of every {@code OS_PAGE_SIZE} stride
 * of the file, then calls {@code mlock()}.
 *
 * <p>With {@code FlushDiskType.SYNC_FLUSH} the buffer is forced whenever at least
 * {@code pages} pages have been written since the last force, and once more after the loop.
 *
 * @param type  flush mode; only {@code SYNC_FLUSH} triggers the forced flushes
 * @param pages minimum number of pages between two forced flushes
 */
public void warmMappedFile(FlushDiskType type, int pages) {
    final long startedAt = System.currentTimeMillis();
    final ByteBuffer view = this.mappedByteBuffer.slice();
    final boolean syncFlush = type == FlushDiskType.SYNC_FLUSH;

    int lastFlushed = 0;
    long lapStart = System.currentTimeMillis();

    int offset = 0;
    int pageIndex = 0;
    while (offset < this.fileSize) {
        // Touch the page by writing a single zero byte at its first position.
        view.put(offset, (byte) 0);

        // force flush when flush disk type is sync
        if (syncFlush && (offset / OS_PAGE_SIZE) - (lastFlushed / OS_PAGE_SIZE) >= pages) {
            lastFlushed = offset;
            mappedByteBuffer.force();
        }

        // Every 1000 pages: log progress and call Thread.sleep(0); the original code labels
        // this step "prevent gc".
        if (pageIndex % 1000 == 0) {
            log.info("j={}, costTime={}", pageIndex, System.currentTimeMillis() - lapStart);
            lapStart = System.currentTimeMillis();
            try {
                Thread.sleep(0);
            } catch (InterruptedException e) {
                log.error("Interrupted", e);
            }
        }

        offset += MappedFile.OS_PAGE_SIZE;
        pageIndex++;
    }

    // force flush when prepare load finished
    if (syncFlush) {
        log.info("mapped file warm-up done, force to disk, mappedFile={}, costTime={}",
            this.getFileName(), System.currentTimeMillis() - startedAt);
        mappedByteBuffer.force();
    }
    log.info("mapped file warm-up done. mappedFile={}, costTime={}", this.getFileName(),
        System.currentTimeMillis() - startedAt);

    this.mlock();
}
11.3.4 commitlog内容写入
写入操作内容实现在类 MappedFile 中,具体往下看
/**
 * Appends a single message by delegating to {@code appendMessagesInner}.
 *
 * @param msg message to append
 * @param cb  callback that performs the actual write into the buffer
 * @return the result produced by {@code appendMessagesInner}
 */
public AppendMessageResult appendMessage(final MessageExtBrokerInner msg, final AppendMessageCallback cb) {
    final AppendMessageResult outcome = this.appendMessagesInner(msg, cb);
    return outcome;
}
/**
 * Appends a message (single or batch) at the current write position of this file.
 *
 * <p>The write goes into a slice of {@code writeBuffer} when that field is set, otherwise
 * into a slice of {@code mappedByteBuffer}. After the callback returns, the write position is
 * advanced by the number of bytes the callback reports and the store timestamp is updated.
 *
 * @param messageExt message to append; must be a {@code MessageExtBrokerInner} or a
 *                   {@code MessageExtBatch}
 * @param cb         callback that serializes the message into the buffer
 * @return the callback's result, or an {@code UNKNOWN_ERROR} result when the message type is
 *         unsupported or the write position has already reached the file size
 */
public AppendMessageResult appendMessagesInner(final MessageExt messageExt, final AppendMessageCallback cb) {
    assert messageExt != null;
    assert cb != null;

    // Position where the previous write ended; this write starts here.
    final int writeStart = this.wrotePosition.get();
    if (writeStart >= this.fileSize) {
        log.error("MappedFile.appendMessage return null, wrotePosition: {} fileSize: {}", writeStart, this.fileSize);
        return new AppendMessageResult(AppendMessageStatus.UNKNOWN_ERROR);
    }

    // Work on a slice so the shared buffer's own position and limit are left untouched.
    final ByteBuffer target = (writeBuffer == null) ? this.mappedByteBuffer.slice() : writeBuffer.slice();
    target.position(writeStart);
    final int remaining = this.fileSize - writeStart;

    final AppendMessageResult result;
    if (messageExt instanceof MessageExtBrokerInner) {
        result = cb.doAppend(this.getFileFromOffset(), target, remaining, (MessageExtBrokerInner) messageExt);
    } else if (messageExt instanceof MessageExtBatch) {
        result = cb.doAppend(this.getFileFromOffset(), target, remaining, (MessageExtBatch) messageExt);
    } else {
        // Any other message type is rejected.
        return new AppendMessageResult(AppendMessageStatus.UNKNOWN_ERROR);
    }

    // Advance the write position and remember when the message was stored.
    this.wrotePosition.addAndGet(result.getWroteBytes());
    this.storeTimestamp = result.getStoreTimestamp();
    return result;
}
DefaultAppendMessageCallback.doAppend
/**
 * Serializes one message into {@code byteBuffer} at its current position.
 *
 * <p>The message is first assembled in the scratch buffer {@code msgStoreItemMemory} and then
 * copied into {@code byteBuffer}. When the message plus {@code END_FILE_MIN_BLANK_LENGTH}
 * does not fit into {@code maxBlank}, a blank record (total size and
 * {@code BLANK_MAGIC_CODE}) covering the remaining {@code maxBlank} bytes is written instead.
 *
 * @param fileFromOffset start offset of the file that {@code byteBuffer} belongs to
 * @param byteBuffer     destination buffer, positioned at the write location
 * @param maxBlank       number of bytes still available in the file
 * @param msgInner       message to store
 * @return a result with status {@code PUT_OK} on success, {@code END_OF_FILE} when the
 *         message does not fit, {@code PROPERTIES_SIZE_EXCEEDED} when the properties exceed
 *         {@code Short.MAX_VALUE} bytes, or {@code MESSAGE_SIZE_EXCEEDED} when the total
 *         length exceeds {@code maxMessageSize}
 */
public AppendMessageResult doAppend(final long fileFromOffset, final ByteBuffer byteBuffer, final int maxBlank,
    final MessageExtBrokerInner msgInner) {
    // STORETIMESTAMP + STOREHOSTADDRESS + OFFSET <br>
    // PHY OFFSET: physical offset of this message within the commit log
    long wroteOffset = fileFromOffset + byteBuffer.position();
    this.resetByteBuffer(hostHolder, 8);
    // The message id is built from the store host bytes and the write offset.
    String msgId = MessageDecoder.createMessageId(this.msgIdMemory, msgInner.getStoreHostBytes(hostHolder), wroteOffset);
    // Build the consume-queue key: {topic}-{queueId}
    keyBuilder.setLength(0);
    keyBuilder.append(msgInner.getTopic());
    keyBuilder.append('-');
    keyBuilder.append(msgInner.getQueueId());
    String key = keyBuilder.toString();
    // Look up the current queue offset for this topic/queue.
    Long queueOffset = CommitLog.this.topicQueueTable.get(key);
    if (null == queueOffset) {// first write for this key: start at offset 0
        queueOffset = 0L;
        // Remember the initial offset in the table.
        CommitLog.this.topicQueueTable.put(key, queueOffset);
    }
    // Transaction handling: prepared and rollback messages get queue offset 0.
    final int tranType = MessageSysFlag.getTransactionValue(msgInner.getSysFlag());
    switch (tranType) {
        // Prepared and Rollback message is not consumed, will not enter the
        // consumer queue
        case MessageSysFlag.TRANSACTION_PREPARED_TYPE:
        case MessageSysFlag.TRANSACTION_ROLLBACK_TYPE:
            queueOffset = 0L;
            break;
        case MessageSysFlag.TRANSACTION_NOT_TYPE:
        case MessageSysFlag.TRANSACTION_COMMIT_TYPE:
        default:
            break;
    }
    // Serialize the variable-length parts (properties, topic) to bytes.
    final byte[] propertiesData =
        msgInner.getPropertiesString() == null ? null : msgInner.getPropertiesString().getBytes(MessageDecoder.CHARSET_UTF8);
    final int propertiesLength = propertiesData == null ? 0 : propertiesData.length;
    // The properties length is later written as a short, so it must fit into one.
    if (propertiesLength > Short.MAX_VALUE) {
        log.warn("putMessage message properties length too long. length={}", propertiesData.length);
        return new AppendMessageResult(AppendMessageStatus.PROPERTIES_SIZE_EXCEEDED);
    }
    final byte[] topicData = msgInner.getTopic().getBytes(MessageDecoder.CHARSET_UTF8);
    final int topicLength = topicData.length;
    final int bodyLength = msgInner.getBody() == null ? 0 : msgInner.getBody().length;
    final int msgLen = calMsgLength(bodyLength, topicLength, propertiesLength);
    // Exceeds the maximum message
    if (msgLen > this.maxMessageSize) {
        CommitLog.log.warn("message size exceeded, msg total size: " + msgLen + ", msg body size: " + bodyLength
            + ", maxMessageSize: " + this.maxMessageSize);
        return new AppendMessageResult(AppendMessageStatus.MESSAGE_SIZE_EXCEEDED);
    }
    // If the message plus the minimum end-of-file blank does not fit into the remaining
    // space, fill the remainder with a blank record and report END_OF_FILE.
    if ((msgLen + END_FILE_MIN_BLANK_LENGTH) > maxBlank) {
        this.resetByteBuffer(this.msgStoreItemMemory, maxBlank);
        // 1 TOTALSIZE
        this.msgStoreItemMemory.putInt(maxBlank);
        // 2 MAGICCODE
        this.msgStoreItemMemory.putInt(CommitLog.BLANK_MAGIC_CODE);
        // 3 The remaining space may be any value
        // Here the length of the specially set maxBlank
        final long beginTimeMills = CommitLog.this.defaultMessageStore.now();
        byteBuffer.put(this.msgStoreItemMemory.array(), 0, maxBlank);
        return new AppendMessageResult(AppendMessageStatus.END_OF_FILE, wroteOffset, maxBlank, msgId, msgInner.getStoreTimestamp(),
            queueOffset, CommitLog.this.defaultMessageStore.now() - beginTimeMills);
    }
    // Assemble the commit-log record field by field; the layout is covered in Chapter 10 of
    // the surrounding article.
    this.resetByteBuffer(msgStoreItemMemory, msgLen);
    // 1 TOTALSIZE
    this.msgStoreItemMemory.putInt(msgLen);
    // 2 MAGICCODE
    this.msgStoreItemMemory.putInt(CommitLog.MESSAGE_MAGIC_CODE);
    // 3 BODYCRC
    this.msgStoreItemMemory.putInt(msgInner.getBodyCRC());
    // 4 QUEUEID
    this.msgStoreItemMemory.putInt(msgInner.getQueueId());
    // 5 FLAG
    this.msgStoreItemMemory.putInt(msgInner.getFlag());
    // 6 QUEUEOFFSET
    this.msgStoreItemMemory.putLong(queueOffset);
    // 7 PHYSICALOFFSET
    this.msgStoreItemMemory.putLong(fileFromOffset + byteBuffer.position());
    // 8 SYSFLAG
    this.msgStoreItemMemory.putInt(msgInner.getSysFlag());
    // 9 BORNTIMESTAMP
    this.msgStoreItemMemory.putLong(msgInner.getBornTimestamp());
    // 10 BORNHOST
    this.resetByteBuffer(hostHolder, 8);
    this.msgStoreItemMemory.put(msgInner.getBornHostBytes(hostHolder));
    // 11 STORETIMESTAMP
    this.msgStoreItemMemory.putLong(msgInner.getStoreTimestamp());
    // 12 STOREHOSTADDRESS
    this.resetByteBuffer(hostHolder, 8);
    this.msgStoreItemMemory.put(msgInner.getStoreHostBytes(hostHolder));
    //this.msgBatchMemory.put(msgInner.getStoreHostBytes());
    // 13 RECONSUMETIMES
    this.msgStoreItemMemory.putInt(msgInner.getReconsumeTimes());
    // 14 Prepared Transaction Offset
    this.msgStoreItemMemory.putLong(msgInner.getPreparedTransactionOffset());
    // 15 BODY (length prefix, then the bytes when present)
    this.msgStoreItemMemory.putInt(bodyLength);
    if (bodyLength > 0)
        this.msgStoreItemMemory.put(msgInner.getBody());
    // 16 TOPIC (length is written as a single byte)
    this.msgStoreItemMemory.put((byte) topicLength);
    this.msgStoreItemMemory.put(topicData);
    // 17 PROPERTIES (length is written as a short)
    this.msgStoreItemMemory.putShort((short) propertiesLength);
    if (propertiesLength > 0)
        this.msgStoreItemMemory.put(propertiesData);
    final long beginTimeMills = CommitLog.this.defaultMessageStore.now();
    // Write messages to the queue buffer
    byteBuffer.put(this.msgStoreItemMemory.array(), 0, msgLen);
    // The record now sits in the destination buffer; flushing to disk is covered in
    // Chapter 12 of the surrounding article.
    AppendMessageResult result = new AppendMessageResult(AppendMessageStatus.PUT_OK, wroteOffset, msgLen, msgId,
        msgInner.getStoreTimestamp(), queueOffset, CommitLog.this.defaultMessageStore.now() - beginTimeMills);
    switch (tranType) {
        case MessageSysFlag.TRANSACTION_PREPARED_TYPE:
        case MessageSysFlag.TRANSACTION_ROLLBACK_TYPE:
            break;
        case MessageSysFlag.TRANSACTION_NOT_TYPE:
        case MessageSysFlag.TRANSACTION_COMMIT_TYPE:
            // Advance the stored queue offset for the next message of this topic/queue.
            CommitLog.this.topicQueueTable.put(key, ++queueOffset);
            break;
        default:
            break;
    }
    return result;
}