{
  "id": "clustering",
  "type": "doc_clustering",
  "modelId": "doc_clustering_v1",
  "uidField": "id",
  "dataFormat": "solr",
  "trainingCollection": "searchhub",
  "fieldToVectorize": "body",
  "trainingDataFilterQuery": "*:*",
  "trainingDataSamplingPercentage": "1.0",
  "sourceFields": "",
  "analyzerConfig": "{ \"analyzers\": [ { \"name\": \"StdTokLowerStop\", \"charFilters\": [ { \"type\": \"htmlstrip\" } ], \"tokenizer\": { \"type\": \"letter\" }, \"filters\": [ { \"type\": \"lowercase\" }, { \"type\": \"length\", \"min\": \"3\", \"max\": \"32767\" }, { \"type\": \"stop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }, { \"type\": \"englishminimalstem\" }] }], \"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
  "clusteringMethod": "hierarchical",
  "clusterIdField": "cluster_id",
  "clusterLabelField": "cluster_label",
  "freqTermField": "freq_terms",
  "distToCenterField": "dist_to_center",
  "outputCollection": "searchhub_out",
  "minDF": "5.0",
  "maxDF": "0.75",
  "kExact": "0",
  "kMax": "10",
  "kMin": "2",
  "docLenTrim": "true",
  "outlierTrim": "true",
  "shortLen": "5.0",
  "longLen": "0.99",
  "numKeywordsPerLabel": "5",
  "randomSeed": "10",
  "outlierK": "10",
  "outlierThreshold": "0.01",
  "w2vDimension": "0",
  "w2vWindowSize": "8",
  "minDivisibleSize": "0.0",
  "kDiscount": "1.0"
}