160 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			C++
		
	
	
	
| //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
 | ||
| //
 | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | ||
| // See https://llvm.org/LICENSE.txt for license information.
 | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | ||
| //
 | ||
| //===----------------------------------------------------------------------===//
 | ||
| //
 | ||
| // Lempel–Ziv–Welch encoding/decoding
 | ||
| //
 | ||
| //===----------------------------------------------------------------------===//
 | ||
| 
 | ||
| #ifndef SANITIZER_LZW_H
 | ||
| #define SANITIZER_LZW_H
 | ||
| 
 | ||
| #include "sanitizer_dense_map.h"
 | ||
| 
 | ||
| namespace __sanitizer {
 | ||
| 
 | ||
| using LzwCodeType = u32;
 | ||
| 
 | ||
| template <class T, class ItIn, class ItOut>
 | ||
| ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
 | ||
|   using Substring =
 | ||
|       detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
 | ||
| 
 | ||
|   // Sentinel value for substrings of len 1.
 | ||
|   static constexpr LzwCodeType kNoPrefix =
 | ||
|       Min(DenseMapInfo<Substring>::getEmptyKey().first,
 | ||
|           DenseMapInfo<Substring>::getTombstoneKey().first) -
 | ||
|       1;
 | ||
|   DenseMap<Substring, LzwCodeType> prefix_to_code;
 | ||
|   {
 | ||
|     // Add all substring of len 1 as initial dictionary.
 | ||
|     InternalMmapVector<T> dict_len1;
 | ||
|     for (auto it = begin; it != end; ++it)
 | ||
|       if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
 | ||
|         dict_len1.push_back(*it);
 | ||
| 
 | ||
|     // Slightly helps with later delta encoding.
 | ||
|     Sort(dict_len1.data(), dict_len1.size());
 | ||
| 
 | ||
|     // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
 | ||
|     // just generate them.
 | ||
|     *out = dict_len1.size();
 | ||
|     ++out;
 | ||
| 
 | ||
|     for (uptr i = 0; i != dict_len1.size(); ++i) {
 | ||
|       // Remap after the Sort.
 | ||
|       prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
 | ||
|       *out = dict_len1[i];
 | ||
|       ++out;
 | ||
|     }
 | ||
|     CHECK_EQ(prefix_to_code.size(), dict_len1.size());
 | ||
|   }
 | ||
| 
 | ||
|   if (begin == end)
 | ||
|     return out;
 | ||
| 
 | ||
|   // Main LZW encoding loop.
 | ||
|   LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
 | ||
|   ++begin;
 | ||
|   for (auto it = begin; it != end; ++it) {
 | ||
|     // Extend match with the new item.
 | ||
|     auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
 | ||
|     if (ins.second) {
 | ||
|       // This is a new substring, but emit the code for the current match
 | ||
|       // (before extend). This allows LZW decoder to recover the dictionary.
 | ||
|       *out = match;
 | ||
|       ++out;
 | ||
|       // Reset the match to a single item, which must be already in the map.
 | ||
|       match = prefix_to_code.find({kNoPrefix, *it})->second;
 | ||
|     } else {
 | ||
|       // Already known, use as the current match.
 | ||
|       match = ins.first->second;
 | ||
|     }
 | ||
|   }
 | ||
| 
 | ||
|   *out = match;
 | ||
|   ++out;
 | ||
| 
 | ||
|   return out;
 | ||
| }
 | ||
| 
 | ||
| template <class T, class ItIn, class ItOut>
 | ||
| ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
 | ||
|   if (begin == end)
 | ||
|     return out;
 | ||
| 
 | ||
|   // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
 | ||
|   InternalMmapVector<T> dict_len1(*begin);
 | ||
|   ++begin;
 | ||
| 
 | ||
|   if (begin == end)
 | ||
|     return out;
 | ||
| 
 | ||
|   for (auto& v : dict_len1) {
 | ||
|     v = *begin;
 | ||
|     ++begin;
 | ||
|   }
 | ||
| 
 | ||
|   // Substrings of len 2 and up. Indexes are shifted because [0,
 | ||
|   // dict_len1.size()) stored in dict_len1. Substings get here after being
 | ||
|   // emitted to the output, so we can use output position.
 | ||
|   InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
 | ||
|       code_to_substr;
 | ||
| 
 | ||
|   // Copies already emitted substrings into the output again.
 | ||
|   auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
 | ||
|     if (code < dict_len1.size()) {
 | ||
|       *out = dict_len1[code];
 | ||
|       ++out;
 | ||
|       return out;
 | ||
|     }
 | ||
|     const auto& s = code_to_substr[code - dict_len1.size()];
 | ||
| 
 | ||
|     for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
 | ||
|     return out;
 | ||
|   };
 | ||
| 
 | ||
|   // Returns lens of the substring with the given code.
 | ||
|   auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
 | ||
|     if (code < dict_len1.size())
 | ||
|       return 1;
 | ||
|     const auto& s = code_to_substr[code - dict_len1.size()];
 | ||
|     return s.second - s.first;
 | ||
|   };
 | ||
| 
 | ||
|   // Main LZW decoding loop.
 | ||
|   LzwCodeType prev_code = *begin;
 | ||
|   ++begin;
 | ||
|   out = copy(prev_code, out);
 | ||
|   for (auto it = begin; it != end; ++it) {
 | ||
|     LzwCodeType code = *it;
 | ||
|     auto start = out;
 | ||
|     if (code == dict_len1.size() + code_to_substr.size()) {
 | ||
|       // Special LZW case. The code is not in the dictionary yet. This is
 | ||
|       // possible only when the new substring is the same as previous one plus
 | ||
|       // the first item of the previous substring. We can emit that in two
 | ||
|       // steps.
 | ||
|       out = copy(prev_code, out);
 | ||
|       *out = *start;
 | ||
|       ++out;
 | ||
|     } else {
 | ||
|       out = copy(code, out);
 | ||
|     }
 | ||
| 
 | ||
|     // Every time encoded emits the code, it also creates substing of len + 1
 | ||
|     // including the first item of the just emmited substring. Do the same here.
 | ||
|     uptr len = code_to_len(prev_code);
 | ||
|     code_to_substr.push_back({start - len, start + 1});
 | ||
| 
 | ||
|     prev_code = code;
 | ||
|   }
 | ||
|   return out;
 | ||
| }
 | ||
| 
 | ||
| }  // namespace __sanitizer
 | ||
| #endif
 |