-
Notifications
You must be signed in to change notification settings - Fork 399
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: collect files in PV when another container restart or stop #2010
base: main
Are you sure you want to change the base?
Changes from 9 commits
7f664ff
1e93f3c
e935e91
de42128
3121ec6
a190069
6d73a06
bfd097c
ae32247
230d519
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -206,6 +206,7 @@ void CheckPointManager::LoadFileCheckPoint(const Json::Value& root) { | |
string realFilePath; | ||
int32_t fileOpenFlag = 0; // default, we close file ptr | ||
int32_t containerStopped = 0; | ||
string containerID; | ||
int32_t lastForceRead = 0; | ||
int32_t idxInReaderArray = LogFileReader::CHECKPOINT_IDX_OF_NEW_READER_IN_ARRAY; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. UT、E2E重点找迅飞review下。 |
||
if (meta.isMember("real_file_name")) { | ||
|
@@ -242,6 +243,9 @@ void CheckPointManager::LoadFileCheckPoint(const Json::Value& root) { | |
if (meta.isMember("container_stopped")) { | ||
containerStopped = meta["container_stopped"].asInt(); | ||
} | ||
if (meta.isMember("container_id")) { | ||
containerID = meta["container_id"].asString(); | ||
} | ||
if (meta.isMember("last_force_read")) { | ||
lastForceRead = meta["last_force_read"].asInt(); | ||
} | ||
|
@@ -267,6 +271,7 @@ void CheckPointManager::LoadFileCheckPoint(const Json::Value& root) { | |
realFilePath, | ||
fileOpenFlag != 0, | ||
containerStopped != 0, | ||
containerID, | ||
lastForceRead != 0); | ||
ptr->mLastUpdateTime = update_time; | ||
ptr->mIdxInReaderArray = idxInReaderArray; | ||
|
@@ -300,6 +305,7 @@ void CheckPointManager::LoadFileCheckPoint(const Json::Value& root) { | |
realFilePath, | ||
fileOpenFlag != 0, | ||
containerStopped != 0, | ||
containerID, | ||
lastForceRead != 0); | ||
ptr->mLastUpdateTime = update_time; | ||
ptr->mIdxInReaderArray = idxInReaderArray; | ||
|
@@ -345,6 +351,7 @@ bool CheckPointManager::DumpCheckPointToLocal() { | |
leaf["dev"] = Json::Value(Json::UInt64(checkPointPtr->mDevInode.dev)); | ||
leaf["file_open"] = Json::Value(checkPointPtr->mFileOpenFlag ? 1 : 0); | ||
leaf["container_stopped"] = Json::Value(checkPointPtr->mContainerStopped ? 1 : 0); | ||
leaf["container_id"] = Json::Value(checkPointPtr->mContainerID); | ||
leaf["last_force_read"] = Json::Value(checkPointPtr->mLastForceRead ? 1 : 0); | ||
leaf["config_name"] = Json::Value(checkPointPtr->mConfigName); | ||
// forward compatible | ||
|
@@ -375,6 +382,7 @@ bool CheckPointManager::DumpCheckPointToLocal() { | |
leaf["dev"] = Json::Value(Json::UInt64(checkPointPtr->mDevInode.dev)); | ||
leaf["file_open"] = Json::Value(checkPointPtr->mFileOpenFlag ? 1 : 0); | ||
leaf["container_stopped"] = Json::Value(checkPointPtr->mContainerStopped ? 1 : 0); | ||
leaf["container_id"] = Json::Value(checkPointPtr->mContainerID); | ||
leaf["last_force_read"] = Json::Value(checkPointPtr->mLastForceRead ? 1 : 0); | ||
leaf["config_name"] = Json::Value(checkPointPtr->mConfigName); | ||
// forward compatible | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -529,6 +529,9 @@ void ModifyHandler::Handle(const Event& event) { | |
for (auto& pair : mNameReaderMap) { | ||
LogFileReaderPtrArray& readerArray = pair.second; | ||
for (auto& reader : readerArray) { | ||
if (reader->GetContainerID() != event.GetContainerID()) { | ||
continue; | ||
} | ||
reader->SetContainerStopped(); | ||
if (reader->IsReadToEnd() || reader->ShouldForceReleaseDeletedFileFd()) { | ||
if (reader->IsFileOpened()) { | ||
|
@@ -539,7 +542,7 @@ void ModifyHandler::Handle(const Event& event) { | |
"project", reader->GetProject())("logstore", reader->GetLogstore())( | ||
"config", mConfigName)("log reader queue name", reader->GetHostLogPath())( | ||
"file device", reader->GetDevInode().dev)("file inode", reader->GetDevInode().inode)( | ||
"file size", reader->GetFileSize())); | ||
"file size", reader->GetFileSize())("container id", event.GetContainerID())); | ||
if (!readerArray[0]->ShouldForceReleaseDeletedFileFd() && reader->HasDataInCache()) { | ||
ForceReadLogAndPush(readerArray[0]); | ||
} | ||
|
@@ -785,16 +788,26 @@ void ModifyHandler::Handle(const Event& event) { | |
"file size", reader->GetFileSize())); | ||
reader->CloseFilePtr(); | ||
} else if (reader->IsContainerStopped()) { | ||
// release fd as quick as possible | ||
LOG_INFO( | ||
sLogger, | ||
("close the file", "current file has been read, and the relative container has been stopped")( | ||
"project", reader->GetProject())("logstore", reader->GetLogstore())("config", mConfigName)( | ||
"log reader queue name", reader->GetHostLogPath())("file device", | ||
reader->GetDevInode().dev)( | ||
"file inode", reader->GetDevInode().inode)("file size", reader->GetFileSize())); | ||
ForceReadLogAndPush(reader); | ||
reader->CloseFilePtr(); | ||
// update container info one more time to ensure the file is still held by the same container | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cotnainer -> container |
||
if (reader->UpdateContainerInfo() && !reader->IsContainerStopped()) { | ||
LOG_INFO(sLogger, | ||
("file is reused by a new container", reader->GetContainerID())( | ||
"project", reader->GetProject())("logstore", reader->GetLogstore())( | ||
"config", mConfigName)("log reader queue name", reader->GetHostLogPath())( | ||
"file device", reader->GetDevInode().dev)( | ||
"file inode", reader->GetDevInode().inode)("file size", reader->GetFileSize())); | ||
} else { | ||
// release fd as quick as possible | ||
LOG_INFO(sLogger, | ||
("close the file", | ||
"current file has been read, and the relative container has been stopped")( | ||
"project", reader->GetProject())("logstore", reader->GetLogstore())( | ||
"config", mConfigName)("log reader queue name", reader->GetHostLogPath())( | ||
"file device", reader->GetDevInode().dev)( | ||
"file inode", reader->GetDevInode().inode)("file size", reader->GetFileSize())); | ||
ForceReadLogAndPush(reader); | ||
reader->CloseFilePtr(); | ||
} | ||
} | ||
break; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -122,6 +122,7 @@ LogFileReader* LogFileReader::CreateLogFileReader(const string& hostLogPathDir, | |
? discoveryConfig.first->GetWildcardPaths()[0] | ||
: discoveryConfig.first->GetBasePath(), | ||
containerPath->mRealBaseDir.size()); | ||
reader->SetContainerID(containerPath->mID); | ||
reader->AddExtraTags(containerPath->mMetadatas); | ||
reader->AddExtraTags(containerPath->mTags); | ||
} | ||
|
@@ -265,6 +266,7 @@ void LogFileReader::DumpMetaToMem(bool checkConfigFlag, int32_t idxInReaderArray | |
mRealLogPath, | ||
mLogFileOp.IsOpen(), | ||
mContainerStopped, | ||
mContainerID, | ||
mLastForceRead); | ||
// use last event time as checkpoint's last update time | ||
checkPointPtr->mLastUpdateTime = mLastEventTime; | ||
|
@@ -312,16 +314,23 @@ void LogFileReader::InitReader(bool tailExisted, FileReadPolicy policy, uint32_t | |
mLastFileSignatureSize = checkPointPtr->mSignatureSize; | ||
mRealLogPath = checkPointPtr->mRealFileName; | ||
mLastEventTime = checkPointPtr->mLastUpdateTime; | ||
mContainerStopped = checkPointPtr->mContainerStopped; | ||
if (checkPointPtr->mContainerID == mContainerID) { | ||
mContainerStopped = checkPointPtr->mContainerStopped; | ||
} else { | ||
LOG_INFO( | ||
sLogger, | ||
("container id is different between container discovery and checkpoint", | ||
checkPointPtr->mRealFileName)("checkpoint", checkPointPtr->mContainerID)("current", mContainerID)); | ||
} | ||
// new property to recover reader exactly from checkpoint | ||
mIdxInReaderArrayFromLastCpt = checkPointPtr->mIdxInReaderArray; | ||
LOG_INFO(sLogger, | ||
("recover log reader status from checkpoint, project", GetProject())("logstore", GetLogstore())( | ||
"config", GetConfigName())("log reader queue name", mHostLogPath)("file device", | ||
ToString(mDevInode.dev))( | ||
"file inode", ToString(mDevInode.inode))("file signature", mLastFileSignatureHash)( | ||
"file signature size", mLastFileSignatureSize)("real file path", mRealLogPath)( | ||
"last file position", mLastFilePos)("index in reader array", mIdxInReaderArrayFromLastCpt)); | ||
"config", GetConfigName())("log reader queue name", mHostLogPath)( | ||
"file device", ToString(mDevInode.dev))("file inode", ToString(mDevInode.inode))( | ||
"file signature", mLastFileSignatureHash)("file signature size", mLastFileSignatureSize)( | ||
"real file path", mRealLogPath)("last file position", mLastFilePos)( | ||
"index in reader array", mIdxInReaderArrayFromLastCpt)("container id", mContainerID)); | ||
// if file is open or | ||
// last update time is new and the file's container is not stopped, | ||
// we should use first modify | ||
|
@@ -2521,6 +2530,31 @@ const std::string& LogFileReader::GetConvertedPath() const { | |
#endif | ||
} | ||
|
||
// Re-resolves the container that owns this reader's host log directory and, when its | ||
// ID differs from the cached mContainerID, refreshes the reader's docker path, | ||
// container ID, stopped flag and extra tags from the newly found container. | ||
// Returns true only when the container info was replaced; returns false when the | ||
// discovery config is missing, no container matches the log path, or the ID is unchanged. | ||
bool LogFileReader::UpdateContainerInfo() { | ||
FileDiscoveryConfig discoveryConfig = FileServer::GetInstance()->GetFileDiscoveryConfig(mConfigName); | ||
if (discoveryConfig.first == nullptr) { | ||
return false; | ||
} | ||
ContainerInfo* containerInfo = discoveryConfig.first->GetContainerPathByLogPath(mHostLogPathDir); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 可能有问题,如果一个是主机上采集pv,一个是容器内,那么主机的是不会去更新container信息的,此时还能否将stop的container重置为空? |
||
// only refresh when a container was found and its ID differs from the cached one | ||
if (containerInfo && containerInfo->mID != mContainerID) { | ||
LOG_INFO(sLogger, | ||
("container info of file reader changed", "may be because container restart")( | ||
"old container id", mContainerID)("new container id", containerInfo->mID)( | ||
"container status", containerInfo->mStopped ? "stopped" : "running")); | ||
// if the config has wildcard paths, use the first one (GetWildcardPaths()[0]) as the base path; | ||
// otherwise fall back to GetBasePath() | ||
SetDockerPath(!discoveryConfig.first->GetWildcardPaths().empty() ? discoveryConfig.first->GetWildcardPaths()[0] | ||
: discoveryConfig.first->GetBasePath(), | ||
containerInfo->mRealBaseDir.size()); | ||
SetContainerID(containerInfo->mID); | ||
// take over the stopped state reported for the newly resolved container | ||
mContainerStopped = containerInfo->mStopped; | ||
// clear mExtraTags, then add the new container's metadata and tags | ||
mExtraTags.clear(); | ||
AddExtraTags(containerInfo->mMetadatas); | ||
AddExtraTags(containerInfo->mTags); | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
#ifdef APSARA_UNIT_TEST_MAIN | ||
void LogFileReader::UpdateReaderManual() { | ||
if (mLogFileOp.IsOpen()) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个场景E2E是否可以构造,是否有对应的用例?