137 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			137 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C++
		
	
	
	
//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
 | 
						|
//
 | 
						|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | 
						|
// See https://llvm.org/LICENSE.txt for license information.
 | 
						|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | 
						|
//
 | 
						|
//===----------------------------------------------------------------------===//
 | 
						|
//
 | 
						|
// This file implements fuzzy-matching of strings against identifiers.
 | 
						|
// It indicates both the existence and quality of a match:
 | 
						|
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
 | 
						|
//
 | 
						|
//===----------------------------------------------------------------------===//
 | 
						|
 | 
						|
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
 | 
						|
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
 | 
						|
 | 
						|
#include "llvm/ADT/ArrayRef.h"
 | 
						|
#include "llvm/ADT/Optional.h"
 | 
						|
#include "llvm/ADT/SmallString.h"
 | 
						|
#include "llvm/ADT/StringRef.h"
 | 
						|
#include "llvm/Support/raw_ostream.h"
 | 
						|
 | 
						|
namespace clang {
 | 
						|
namespace clangd {
 | 
						|
 | 
						|
// Utilities for word segmentation.
 | 
						|
// FuzzyMatcher already incorporates this logic, so most users don't need this.
 | 
						|
//
 | 
						|
// A name like "fooBar_baz" consists of several parts foo, bar, baz.
 | 
						|
// Aligning segmentation of word and pattern improves the fuzzy-match.
 | 
						|
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
 | 
						|
//
 | 
						|
// First we classify each character into types (uppercase, lowercase, etc).
 | 
						|
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
 | 
						|
 | 
						|
// We distinguish the types of characters that affect segmentation.
 | 
						|
// It's not obvious how to segment digits, we treat them as lowercase letters.
 | 
						|
// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
 | 
						|
// This means we require exact (case-sensitive) match for those characters.
 | 
						|
enum CharType : unsigned char {
 | 
						|
  Empty = 0,       // Before-the-start and after-the-end (and control chars).
 | 
						|
  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
 | 
						|
  Upper = 2,       // Uppercase letters.
 | 
						|
  Punctuation = 3, // ASCII punctuation (including Space)
 | 
						|
};
 | 
						|
// A CharTypeSet is a bitfield representing all the character types in a word.
 | 
						|
// Its bits are 1<<Empty, 1<<Lower, etc.
 | 
						|
using CharTypeSet = unsigned char;
 | 
						|
 | 
						|
// Each character's Role is the Head or Tail of a segment, or a Separator.
 | 
						|
// e.g. XMLHttpRequest_Async
 | 
						|
//      +--+---+------ +----
 | 
						|
//      ^Head   ^Tail ^Separator
 | 
						|
enum CharRole : unsigned char {
 | 
						|
  Unknown = 0,   // Stray control characters or impossible states.
 | 
						|
  Tail = 1,      // Part of a word segment, but not the first character.
 | 
						|
  Head = 2,      // The first character of a word segment.
 | 
						|
  Separator = 3, // Punctuation characters that separate word segments.
 | 
						|
};
 | 
						|
 | 
						|
// Compute segmentation of Text.
 | 
						|
// Character roles are stored in Roles (Roles.size() must equal Text.size()).
 | 
						|
// The set of character types encountered is returned, this may inform
 | 
						|
// heuristics for dealing with poorly-segmented identifiers like "strndup".
 | 
						|
CharTypeSet calculateRoles(llvm::StringRef Text,
 | 
						|
                           llvm::MutableArrayRef<CharRole> Roles);
 | 
						|
 | 
						|
// A matcher capable of matching and scoring strings against a single pattern.
 | 
						|
// It's optimized for matching against many strings - match() does not allocate.
 | 
						|
class FuzzyMatcher {
 | 
						|
public:
 | 
						|
  // Characters beyond MaxPat are ignored.
 | 
						|
  FuzzyMatcher(llvm::StringRef Pattern);
 | 
						|
 | 
						|
  // If Word matches the pattern, return a score indicating the quality match.
 | 
						|
  // Scores usually fall in a [0,1] range, with 1 being a very good score.
 | 
						|
  // "Super" scores in (1,2] are possible if the pattern is the full word.
 | 
						|
  // Characters beyond MaxWord are ignored.
 | 
						|
  llvm::Optional<float> match(llvm::StringRef Word);
 | 
						|
 | 
						|
  llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
 | 
						|
  bool empty() const { return PatN == 0; }
 | 
						|
 | 
						|
  // Dump internal state from the last match() to the stream, for debugging.
 | 
						|
  // Returns the pattern with [] around matched characters, e.g.
 | 
						|
  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
 | 
						|
  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
 | 
						|
 | 
						|
private:
 | 
						|
  // We truncate the pattern and the word to bound the cost of matching.
 | 
						|
  constexpr static int MaxPat = 63, MaxWord = 127;
 | 
						|
  // Action describes how a word character was matched to the pattern.
 | 
						|
  // It should be an enum, but this causes bitfield problems:
 | 
						|
  //   - for MSVC the enum type must be explicitly unsigned for correctness
 | 
						|
  //   - GCC 4.8 complains not all values fit if the type is unsigned
 | 
						|
  using Action = bool;
 | 
						|
  constexpr static Action Miss = false; // Word character was skipped.
 | 
						|
  constexpr static Action Match = true; // Matched against a pattern character.
 | 
						|
 | 
						|
  bool init(llvm::StringRef Word);
 | 
						|
  void buildGraph();
 | 
						|
  bool allowMatch(int P, int W, Action Last) const;
 | 
						|
  int skipPenalty(int W, Action Last) const;
 | 
						|
  int matchBonus(int P, int W, Action Last) const;
 | 
						|
 | 
						|
  // Pattern data is initialized by the constructor, then constant.
 | 
						|
  char Pat[MaxPat];         // Pattern data
 | 
						|
  int PatN;                 // Length
 | 
						|
  char LowPat[MaxPat];      // Pattern in lowercase
 | 
						|
  CharRole PatRole[MaxPat]; // Pattern segmentation info
 | 
						|
  CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters
 | 
						|
  float ScoreScale;         // Normalizes scores for the pattern length.
 | 
						|
 | 
						|
  // Word data is initialized on each call to match(), mostly by init().
 | 
						|
  char Word[MaxWord];         // Word data
 | 
						|
  int WordN;                  // Length
 | 
						|
  char LowWord[MaxWord];      // Word in lowercase
 | 
						|
  CharRole WordRole[MaxWord]; // Word segmentation info
 | 
						|
  CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters
 | 
						|
  bool WordContainsPattern;   // Simple substring check
 | 
						|
 | 
						|
  // Cumulative best-match score table.
 | 
						|
  // Boundary conditions are filled in by the constructor.
 | 
						|
  // The rest is repopulated for each match(), by buildGraph().
 | 
						|
  struct ScoreInfo {
 | 
						|
    signed int Score : 15;
 | 
						|
    Action Prev : 1;
 | 
						|
  };
 | 
						|
  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
 | 
						|
};
 | 
						|
 | 
						|
} // namespace clangd
 | 
						|
} // namespace clang
 | 
						|
 | 
						|
#endif
 |