The WARC files are from the Common Crawl. A sample:
WARC-Type: response
WARC-Date: 2018-12-09T20:26:32Z
WARC-Record-ID: <urn:uuid:5e578aa4-4ec1-4b48-a3ff-cc0a154660f8>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=windows-1251">
<meta http-equiv="Content
Environment: I am using C++ on VS 2019 under Windows. I prefer not to use special libraries. I looked at memory-mapping the file, but I was told it is not faster when you are simply parsing the file sequentially. Because I am on Windows, I get a nice GUI, but I also get all the Unicode mess.
Desired output of the parse: I have an output file where I want to save most of the text and some of the tags; I will be discarding most of the input. Some WARC headers signal to the parser that it can skip ahead, say, 500 characters. For example, any "WARC-Type" other than "WARC-Type: response" lets the parser skip ahead a known amount.
What I have tried: reading the file into a heap buffer and then slicing the buffer with a sliding window, skipping or saving based on the content of the window. I am able to catch a tag that spans buffers. Eventually I will use things like regex and string::find to match tags and text.
The big problem: Unicode. The file is UTF-8 with all kinds of interesting characters, as you can imagine. I use MultiByteToWideChar. If I only convert the window, I don't use as much memory, but I run into problems with the text lining up: 15 bytes of UTF-8 do not decode to 15 wide characters. Depending on my MultiByteToWideChar flags, sizes, etc., I get text skipped, decode errors, and so on. If I convert the entire buffer and then slice it into a window, I am using twice as much memory. That is not a big deal, but it seems inefficient.
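To make the window conversion concrete, here is the sort of thing I am considering: keep the file buffer as raw bytes, pull the window end back to a UTF-8 character boundary, and widen only that window. utf8SafeEnd and ConvertWindow are just names I made up for this sketch, and error handling is left out:

// sketch only: assumes the buffer holds raw UTF-8 bytes (char, not TCHAR)
#include <windows.h>
#include <string>

// Pull `end` back until buffer[end] is not a UTF-8 continuation byte (10xxxxxx),
// so the window [start, end) never ends in the middle of a multi-byte character.
// Assumes end < bytesInBuffer; the leftover bytes get picked up by the next window.
size_t utf8SafeEnd(const char* buffer, size_t end)
{
    while (end > 0 && (static_cast<unsigned char>(buffer[end]) & 0xC0) == 0x80)
        --end;
    return end;
}

// Widen just the window [start, end) from UTF-8 to UTF-16.
std::wstring ConvertWindow(const char* buffer, size_t start, size_t end)
{
    int srcLen = static_cast<int>(end - start);
    if (srcLen <= 0) return L"";
    int wideLen = MultiByteToWideChar(CP_UTF8, 0, buffer + start, srcLen, nullptr, 0);
    if (wideLen <= 0) return L"";
    std::wstring out(wideLen, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, buffer + start, srcLen, &out[0], wideLen);
    return out;
}

The part I am unsure about is whether this is the right way to handle the boundaries, or whether I should just convert the whole buffer after all. My current code: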
BOOL pageActive = FALSE;
BOOL xml = FALSE;
#define MAXBUFFERSIZE 1024
#define MAXTAGSIZE 64
DWORD windowStart = 0; DWORD windowEnd = 15; DWORD windowSize = 15;
DWORD bufferSize = MAXBUFFERSIZE;
__int64 fileRemaining;
HANDLE hFile;
DWORD dwBytesRead = 0;
LARGE_INTEGER dwPosition;
char* buffer;   // raw bytes from the file; widened to TCHAR only per window
hFile = CreateFile(
    inputFilePath,          // file to open
    GENERIC_READ,           // open for reading
    FILE_SHARE_READ,        // share for reading
    NULL,                   // default security
    OPEN_EXISTING,          // existing file only
    FILE_ATTRIBUTE_NORMAL,  // normal file | FILE_FLAG_OVERLAPPED
    NULL);                  // no attr. template

if (hFile == INVALID_HANDLE_VALUE)
{
    DisplayErrorBox((LPWSTR)L"CreateFile");
    return 0;
}
LARGE_INTEGER size;
GetFileSizeEx(hFile, &size);
__int64 fileSize = (__int64)size.QuadPart;

if (fileSize > MAXBUFFERSIZE) { buffer = new char[MAXBUFFERSIZE]; }
else                          { buffer = new char[fileSize]; }

fileRemaining = fileSize;
while (fileRemaining) // outer loop: one buffer-sized chunk per pass
{
    if (bufferSize > fileRemaining)
        bufferSize = (DWORD)fileRemaining;

    if (FALSE == ReadFile(hFile, buffer, bufferSize, &dwBytesRead, NULL))
    {
        sendToReportWindow(L"file read failed\n");
        CloseHandle(hFile);
        return 0;
    }
    fileRemaining -= dwBytesRead;   // subtract what was actually read
    while (windowEnd < dwBytesRead) // inner loop: while unused data remains in the buffer
    {
        windowSize = windowEnd - windowStart;

        // slice the current window out of the raw byte buffer
        string str(buffer + windowStart, windowSize);

        // convert only the window; this is the call whose flags/size I have been fighting with
        TCHAR converted[MAXTAGSIZE] = { 0 };
        MultiByteToWideChar(CP_ACP, MB_COMPOSITE, str.c_str(), -1, converted, MAXTAGSIZE);

        sendToReportWindow(L"windowStart:%d windowEnd:%d converted:%s\n", windowStart, windowEnd, converted);

        // skip these sections and continue to build the window unless there are
        // sufficient characters in the window to make it worth checking its content
        // to do: WARC page state (use regex and string::find)
        //        allow skipping ~500 chars in some cases
        // to do: XML tags/text
        //        keep track of tags that span multiple buffers

        windowStart = windowEnd;
        windowEnd++;
    } // inner loop
} // outer loop

delete [] buffer;   // freed once, after the outer loop (not per pass)
CloseHandle(hFile);
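For the WARC-Type skipping in the to-dos, I am thinking of something along these lines, matching on the raw bytes before any conversion (handleWarcType and the 500-byte jump are placeholders; the real skip distance would come from the record itself):

#include <string>

const std::string kTypeHeader = "WARC-Type: ";
const std::string kWanted     = "WARC-Type: response";
const size_t      kSkipAhead  = 500;   // placeholder skip distance

// Inspect a potential "WARC-Type" line starting at pos and return the new window start.
size_t handleWarcType(const std::string& window, size_t pos)
{
    if (pos >= window.size())
        return pos;
    if (window.compare(pos, kWanted.size(), kWanted) == 0)
        return pos + kWanted.size();      // response record: keep parsing normally
    if (window.compare(pos, kTypeHeader.size(), kTypeHeader) == 0)
        return pos + kSkipAhead;          // any other record type: jump ahead
    return pos;                           // not a WARC-Type line
}

Is the per-window conversion approach workable at all, or is there a better way to handle the UTF-8 buffers on Windows without special libraries?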