diff --git a/src/zimlib/include/zim/cache.h b/src/zimlib/include/zim/cache.h index 6c85fc2..51a889f 100644 --- a/src/zimlib/include/zim/cache.h +++ b/src/zimlib/include/zim/cache.h @@ -20,109 +20,327 @@ #ifndef ZIM_CACHE_H #define ZIM_CACHE_H -#include -#include +#include +#include +#include namespace zim { + /** + Implements a container for caching elements. + + The cache holds a list of key-value-pairs. There are 2 main operations for + accessing the cache: put and get. Put takes a key and a value and puts the + element into the list. Get takes a key and optionally a value. If the value + for the key is found, it is returned. The passed value otherwise. By + default the value is constructed with the empty ctor of the value-type. + + The cache has a maximum size, after which key-value-pairs are dropped, + when a new item is put into the cache. + + The algorithm for this cache is as follows: + - when the cache is not full, new elements are appended + - new elements are put into the middle of the list otherwise + - the last element of the list is then dropped + - when getting a value and the value is found, it is put to the + beginning of the list + + Elements are looked up by key with std::map::find, so the key type must + be usable as a std::map key. + + The caching algorithm keeps elements, which are fetched more than once in + the first half of the list. In the second half the elements are either new + or the elements are pushed from the first half to the second half by other + elements, which are found in the cache. + + You should be aware, that the key type should be simple. Comparing keys + must be cheap. Copying elements (both key and value) must be possible and + should be cheap, since they are moved in the underlying container.
+ + */ template class Cache { - typedef std::deque > DataType; + struct Data + { + bool winner; + unsigned serial; + Value value; + Data() { } + Data(bool winner_, unsigned serial_, const Value& value_) + : winner(winner_), + serial(serial_), + value(value_) + { } + }; + + typedef std::map DataType; DataType data; + typename DataType::size_type maxElements; + unsigned serial; unsigned hits; unsigned misses; + unsigned _nextSerial() + { + if (serial == std::numeric_limits::max()) + { + for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + it->second.serial = 0; + serial = 1; + } + + return serial++; + } + + typename DataType::iterator _getOldest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial < foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + typename DataType::iterator _getNewest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial > foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + // drop one element + void _dropLooser() + { + // look for the oldest element in the list of loosers to drop it + data.erase(_getOldest(false)); + } + + void _makeLooser() + { + // look for the oldest element in the list of winners to make it a looser + typename DataType::iterator it = _getOldest(true); + it->second.winner = false; + it->second.serial = _nextSerial(); + } + public: typedef typename DataType::size_type size_type; - typedef typename DataType::value_type value_type; - typedef typename DataType::iterator iterator; - typedef typename DataType::const_iterator 
const_iterator; + typedef Value value_type; explicit Cache(size_type maxElements_) - : maxElements(maxElements_), + : maxElements(maxElements_ + (maxElements_ & 1)), + serial(0), hits(0), misses(0) { } + /// returns the number of elements currently in the cache size_type size() const { return data.size(); } - iterator begin() { return data.begin(); } - iterator end() { return data.end(); } - const_iterator begin() const { return data.begin(); } - const_iterator end() const { return data.end(); } - size_type getMaxElements() const { return maxElements; } + /// returns the maximum number of elements in the cache + size_type getMaxElements() const { return maxElements; } + void setMaxElements(size_type maxElements_) { - maxElements = maxElements_; - if (data.size() > maxElements) - data.erase(data.begin() + maxElements, data.end()); - } + size_type numWinners = size() < maxElements / 2 ? size() : maxElements / 2; - bool erase(const Key& key) // returns true, if key was found and removed - { - for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + maxElements_ += (maxElements_ & 1); + + if (maxElements_ > maxElements) { - if (it->first == key) + maxElements = maxElements_; + + while (numWinners < maxElements / 2) { - data.erase(it); - return true; + _getNewest(false)->winner = true; + ++numWinners; + } + } + else + { + while (maxElements > maxElements_) + { + _dropLooser(); + _dropLooser(); + _makeLooser(); + maxElements -= 2; + } + + while (numWinners > maxElements / 2) + { + _getNewest(true)->winner = false; + --numWinners; } } - return false; } + /// removes a element from the cache and returns true, if found + bool erase(const Key& key) + { + typename DataType::iterator it = data.find(key); + if (it == data.end()) + return false; + + if (it->second.winner) + _getNewest(false)->winner=true; + + data.erase(it); + return true; + } + + /// clears the cache. 
+ void clear(bool stats = false) + { + data.clear(); + if (stats) + hits = misses = 0; + } + + /// puts a new element in the cache. If the element is already found in + /// the cache, it is considered a cache hit and pushed to the top of the + /// list. void put(const Key& key, const Value& value) { - for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + typename DataType::iterator it; + if (data.size() < maxElements) + { + data.insert(data.begin(), + typename DataType::value_type(key, + Data(data.size() < maxElements / 2, _nextSerial(), value))); + } + else if ((it = data.find(key)) == data.end()) + { + // element not found + _dropLooser(); + data.insert(data.begin(), + typename DataType::value_type(key, + Data(false, _nextSerial(), value))); + } + else { - if (it->first == key) + // element found + it->second.serial = _nextSerial(); + if (!it->second.winner) { - data.erase(it); - data.push_front(typename DataType::value_type(key, value)); - return; + // move element to the winner part + it->second.winner = true; + _makeLooser(); } } + } - ++misses; + /// puts a new element on the top of the cache. If the element is already + /// found in the cache, it is considered a cache hit and pushed to the + /// top of the list. This method actually overrides the need, that a element + /// needs a hit to get to the top of the cache. 
+ void put_top(const Key& key, const Value& value) + { + typename DataType::iterator it; + if (data.size() < maxElements) + { + if (data.size() >= maxElements / 2) + _makeLooser(); - if (data.size() < maxElements / 2) - data.push_back(typename DataType::value_type(key, value)); + data.insert(data.begin(), + typename DataType::value_type(key, + Data(true, _nextSerial(), value))); + } + else if ((it = data.find(key)) == data.end()) + { + // element not found + _dropLooser(); + _makeLooser(); + data.insert(data.begin(), + typename DataType::value_type(key, + Data(true, _nextSerial(), value))); + } else - data.insert(data.begin() + maxElements / 2, typename DataType::value_type(key, value)); - - if (data.size() > maxElements) - data.pop_back(); + { + // element found + it->second.serial = _nextSerial(); + if (!it->second.winner) + { + // move element to the winner part + it->second.winner = true; + _makeLooser(); + } + } } - std::pair getx(const Key& key, Value def = Value()) + Value* getptr(const Key& key) { - for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + typename DataType::iterator it = data.find(key); + if (it == data.end()) + return 0; + + it->second.serial = _nextSerial(); + + if (!it->second.winner) { - if (it->first == key) - { - typename DataType::value_type v = *it; - data.erase(it); - data.push_front(v); - ++hits; - return std::pair(true, v.second); - } + // move element to the winner part + it->second.winner = true; + _makeLooser(); } - return std::pair(false, def); + return &it->second.value; + } + + /// returns a pair of values - a flag, if the value was found and the + /// value if found or the passed default otherwise. If the value is + /// found it is a cache hit and pushed to the top of the list. + std::pair getx(const Key& key, Value def = Value()) + { + Value* v = getptr(key); + return v ? std::pair(true, *v) + : std::pair(false, def); } + /// returns the value to a key or the passed default value if not found.
+ /// If the value is found it is a cache hit and pushed to the top of the + /// list. Value get(const Key& key, Value def = Value()) { return getx(key, def).second; } + /// returns the number of hits. unsigned getHits() const { return hits; } + /// returns the number of misses. unsigned getMisses() const { return misses; } + /// returns the cache hit ratio between 0 and 1. double hitRatio() const { return hits+misses > 0 ? static_cast(hits)/static_cast(hits+misses) : 0; } + /// returns the ratio, between held elements and maximum elements. double fillfactor() const { return static_cast(data.size()) / static_cast(maxElements); } + +/* + void dump(std::ostream& out) const + { + out << "cache max size=" << maxElements << " current size=" << size() << '\n'; + for (typename DataType::const_iterator it = data.begin(); it != data.end(); ++it) + { + out << "\tkey=\"" << it->first << "\" value=\"" << it->second.value << "\" serial=" << it->second.serial << " winner=" << it->second.winner << '\n'; + } + out << "--------\n"; + } +*/ + }; + } + #endif // ZIM_CACHE_H diff --git a/src/zimlib/include/zim/dirent.h b/src/zimlib/include/zim/dirent.h index 514db57..775e411 100644 --- a/src/zimlib/include/zim/dirent.h +++ b/src/zimlib/include/zim/dirent.h @@ -44,7 +44,15 @@ namespace zim std::string parameter; public: - Dirent() {} + Dirent() + : redirect(false), + mimeType(0), + version(0), + clusterNumber(0), + blobNumber(0), + redirectIndex(0), + ns('\0') + {} bool isRedirect() const { return redirect; } uint16_t getMimeType() const { return mimeType; } diff --git a/src/zimlib/include/zim/file.h b/src/zimlib/include/zim/file.h index 5ef927e..a6ac75b 100644 --- a/src/zimlib/include/zim/file.h +++ b/src/zimlib/include/zim/file.h @@ -44,6 +44,7 @@ namespace zim const std::string& getFilename() const { return impl->getFilename(); } const Fileheader& getFileheader() const { return impl->getFileheader(); } + offset_type getFilesize() const { return impl->getFilesize(); } Dirent
getDirent(size_type idx) { return impl->getDirent(idx); } Dirent getDirentByTitle(size_type idx) { return impl->getDirentByTitle(idx); } @@ -86,6 +87,7 @@ namespace zim const_iterator find(const std::string& url); bool good() const { return impl.getPointer() != 0; } + time_t getMTime() const { return impl->getMTime(); } const std::string& getMimeType(uint16_t idx) const { return impl->getMimeType(idx); } diff --git a/src/zimlib/include/zim/fileheader.h b/src/zimlib/include/zim/fileheader.h index 85c7630..a34ed72 100644 --- a/src/zimlib/include/zim/fileheader.h +++ b/src/zimlib/include/zim/fileheader.h @@ -20,12 +20,6 @@ #ifndef ZIM_FILEHEADER_H #define ZIM_FILEHEADER_H -#ifdef _WIN32 -#ifdef max -#undef max -#endif -#endif - #include #include #include diff --git a/src/zimlib/include/zim/fileimpl.h b/src/zimlib/include/zim/fileimpl.h index 075367c..1cf584d 100644 --- a/src/zimlib/include/zim/fileimpl.h +++ b/src/zimlib/include/zim/fileimpl.h @@ -55,8 +55,11 @@ namespace zim public: explicit FileImpl(const char* fname); + time_t getMTime() const { return zimFile.getMTime(); } + const std::string& getFilename() const { return filename; } const Fileheader& getFileheader() const { return header; } + offset_type getFilesize() const { return zimFile.fsize(); } Dirent getDirent(size_type idx); Dirent getDirentByTitle(size_type idx); diff --git a/src/zimlib/include/zim/fstream.h b/src/zimlib/include/zim/fstream.h index faf2bcb..970920e 100644 --- a/src/zimlib/include/zim/fstream.h +++ b/src/zimlib/include/zim/fstream.h @@ -68,6 +68,8 @@ namespace zim void setCurrentFile(const std::string& fname, zim::offset_type off); + mutable time_t mtime; + public: streambuf(const std::string& fname, unsigned bufsize, unsigned openFilesCache); @@ -75,6 +77,7 @@ namespace zim void setBufsize(unsigned s) { buffer.resize(s); } zim::offset_type fsize() const; + time_t getMTime() const; }; class ifstream : public std::istream @@ -92,6 +95,7 @@ namespace zim void seekg(zim::offset_type off) { 
myStreambuf.seekg(off); } void setBufsize(unsigned s) { myStreambuf.setBufsize(s); } zim::offset_type fsize() const { return myStreambuf.fsize(); } + time_t getMTime() const { return myStreambuf.getMTime(); } }; } diff --git a/src/zimlib/src/fstream.cpp b/src/zimlib/src/fstream.cpp index 844c00e..df7db82 100644 --- a/src/zimlib/src/fstream.cpp +++ b/src/zimlib/src/fstream.cpp @@ -27,6 +27,9 @@ #include #include #include +#ifdef WITH_CXXTOOLS +#include +#endif #ifdef _WIN32 #include @@ -184,7 +187,8 @@ namespace streambuf::streambuf(const std::string& fname, unsigned bufsize, unsigned noOpenFiles) : buffer(bufsize), - openFilesCache(noOpenFiles) + openFilesCache(noOpenFiles), + mtime(0) { log_debug("streambuf for " << fname << " with " << bufsize << " bytes"); @@ -287,4 +291,33 @@ zim::offset_type streambuf::fsize() const return o; } +time_t streambuf::getMTime() const +{ + if (mtime || files.empty()) + return mtime; + + const char* fname = files.front()->fname.c_str(); + +#ifdef HAVE_STAT64 + struct stat64 st; + int ret = ::stat64(fname, &st); +#else + struct stat st; + int ret = ::stat(fname, &st); +#endif + if (ret != 0) +#ifdef WITH_CXXTOOLS + throw cxxtools::SystemError("stat"); +#else + { + std::ostringstream msg; + msg << "stat failed with errno " << errno << " : " << strerror(errno); + throw std::runtime_error(msg.str()); + } +#endif + mtime = st.st_mtime; + + return mtime; +} + } diff --git a/src/zimlib/src/uuid.cpp b/src/zimlib/src/uuid.cpp index 08d5d63..2762b3d 100644 --- a/src/zimlib/src/uuid.cpp +++ b/src/zimlib/src/uuid.cpp @@ -19,7 +19,6 @@ #include #include -//#include #include #include // necessary to have the new types #include "log.h" @@ -42,7 +41,6 @@ int gettimeofday(struct timeval* tp, void* tzp) { #define getpid GetCurrentProcessId #else - #include # include #endif diff --git a/src/zimlib/src/zintstream.cpp b/src/zimlib/src/zintstream.cpp index 4c423fc..6ce9259 100644 --- a/src/zimlib/src/zintstream.cpp +++ b/src/zimlib/src/zintstream.cpp @@ 
-18,6 +18,7 @@ */ #include +#include #include "log.h" log_define("zim.zintstream")